Mercurial > repos > devteam > rcve
changeset 0:7740956d197b draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 10:52:35 -0400 |
parents | |
children | |
files | rcve.py rcve.xml test-data/rcve_out.dat test-data/reg_inp.tab tool_dependencies.xml |
diffstat | 5 files changed, 337 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Compute RCVE values for every subset of the selected predictor columns.

Usage: rcve.py <input.tab> <response_col> <predictor_cols> <output>

``response_col`` is a 1-based column number and ``predictor_cols`` is a
comma-separated list of 1-based column numbers in the tab-delimited input.
A linear model ``y ~ x`` is fitted through R (via rpy) for the full set of
predictors and for every non-empty subset of them; for each subset the
output holds the subset's R-squared and
``(R2_full - R2_subset) / R2_full`` for the predictors left out.
"""

import sys
from rpy import *
import numpy

# Number of leading input lines probed for a first data line.
MAX_PROBE_LINES = 31


def stop_err(msg):
    """Write *msg* to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def sscombs(s):
    """Return every non-empty, order-preserving sub-sequence of *s*.

    For "012" the result is ["0", "01", "012", "02", "1", "12", "2"].
    An empty *s* yields an empty list instead of recursing without end.
    """
    if not s:
        return []
    if len(s) == 1:
        return [s]
    rest = sscombs(s[1:])
    return [s[0]] + [s[0] + comb for comb in rest] + rest


def _first_data_fields(path):
    """Return the tab-split fields of the first data line, or [] if none.

    Only the first MAX_PROBE_LINES lines are inspected; blank lines and
    lines starting with '#' are not data lines.
    """
    with open(path) as handle:
        for index, line in enumerate(handle):
            if index >= MAX_PROBE_LINES:
                break
            line = line.rstrip('\r\n')
            if line and not line.startswith('#'):
                return line.split('\t')
    return []


def _float_or_na(fields, index):
    """Return fields[index] as a float, or R's NA if missing/non-numeric."""
    try:
        return float(fields[index])
    except (ValueError, IndexError):
        return r('NA')


def _read_columns(path, y_col, x_cols):
    """Read the response and predictor columns (0-based indices) of *path*.

    Returns ``(y_vals, x_vals)`` where ``x_vals[k]`` holds the values of
    predictor ``x_cols[k]``.  Every non-comment line contributes exactly
    one entry to each list, so the columns always stay aligned.
    """
    y_vals = []
    x_vals = [[] for _col in x_cols]
    with open(path) as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            fields = line.split("\t")
            y_vals.append(_float_or_na(fields, y_col))
            for k, col in enumerate(x_cols):
                x_vals[k].append(_float_or_na(fields, col))
    return y_vals, x_vals


def _fit_lm(dat, exclude_na=False):
    """Fit ``y ~ x`` on *dat* and return the unconverted R model object.

    The fit runs under NO_CONVERSION so the result stays an R object;
    BASIC_CONVERSION is restored afterwards, even when R raises.
    """
    set_default_mode(NO_CONVERSION)
    try:
        if exclude_na:
            dat = r.na_exclude(dat)
        return r.lm(r("y ~ x"), data=dat)
    finally:
        set_default_mode(BASIC_CONVERSION)


def _r_squared(model):
    """Return the 'r.squared' entry of R's summary(model), or 'NA'."""
    return r.summary(model).get('r.squared', 'NA')


def _format_4dp(value):
    """Format *value* with four decimals; return it unchanged if not numeric."""
    try:
        return "%.4f" % float(value)
    except (TypeError, ValueError):
        return value


def _column_labels(positions, x_cols):
    """Return the 1-based column numbers for *positions*, each followed by a space.

    NOTE: the trailing space is kept on purpose.  The original code called
    ``.strip()`` but discarded the result, so existing outputs (and the
    expected test output) contain it.
    """
    return "".join("%d " % (x_cols[int(pos)] + 1) for pos in positions)


def _run(infile, y_col, x_cols, fout):
    """Fit the full and all reduced models and write the RCVE table to *fout*."""
    if not _first_data_fields(infile):
        stop_err("The data in your input dataset is either missing or not formatted properly.")

    x_cols = [int(col) - 1 for col in x_cols]
    y_vals, x_vals = _read_columns(infile, y_col, x_cols)

    # Full model: includes all the predictor variables specified by the user.
    # NOTE(review): only this fit passes the data through na.exclude; the
    # subset fits below hand the data to lm() unchanged -- confirm that this
    # asymmetry is intended.
    dat = r.list(x=array(numpy.asarray(x_vals).transpose()), y=y_vals)
    try:
        full = _fit_lm(dat, exclude_na=True)
    except RException:
        stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
    fullr2 = _r_squared(full)
    if fullr2 == 'NA':
        stop_err("Error in linear regression")

    # Each predictor is identified by a single digit, hence the limit.
    if len(x_vals) >= 10:
        stop_err("This tool only works with less than 10 predictors.")
    s = "".join(str(ch) for ch in range(len(x_vals)))

    fout.write("#Model\tR-sq\tRCVE_Terms\tRCVE_Value\n")
    # Stable sort followed by reverse() (not reverse=True): this also
    # reverses the order among subsets of equal size, which is the row
    # order existing outputs have.
    all_combos = sorted(sscombs(s), key=len)
    all_combos.reverse()
    for cols in all_combos:
        if len(cols) == 1:
            x_data = x_vals[int(cols)]
        else:
            x_data = numpy.asarray([x_vals[int(col)] for col in cols]).transpose()
        red = _fit_lm(r.list(x=array(x_data), y=y_vals))  # Reduced model
        redr2 = _r_squared(red)

        col_str = _column_labels(cols, x_cols)
        if len(cols) == len(s):
            # This row is the full model: nothing was dropped.
            rcve_col_str = "-"
            rcve = "-"
        else:
            rcve_col_str = _column_labels(
                [pos for pos in s if pos not in cols], x_cols)
            try:
                rcve = (float(fullr2) - float(redr2)) / float(fullr2)
            except (TypeError, ValueError, ZeroDivisionError):
                rcve = 'NA'

        fout.write("%s\t%s\t%s\t%s\n" % (
            col_str, _format_4dp(redr2), rcve_col_str, _format_4dp(rcve)))


def main():
    """Parse the command line and write the RCVE table to the output file."""
    infile = sys.argv[1]
    y_col = int(sys.argv[2]) - 1
    x_cols = sys.argv[3].split(',')
    outfile = sys.argv[4]

    sys.stdout.write("Predictor columns: %s; Response column: %d\n" % (x_cols, y_col + 1))
    # The output is opened before any validation, as before, and is now
    # closed on every exit path (including stop_err's SystemExit).
    with open(outfile, 'w') as fout:
        _run(infile, y_col, x_cols, fout)


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rcve.xml Tue Apr 01 10:52:35 2014 -0400 @@ -0,0 +1,75 @@ +<tool id="rcve1" name="Compute RCVE" version="1.0.0"> + <description> </description> + <requirements> + <requirement type="package" version="2.11.0">R</requirement> + <requirement type="package" version="1.7.1">numpy</requirement> + <requirement type="package" version="1.0.3">rpy</requirement> + </requirements> + <command interpreter="python"> + rcve.py + $input1 + $response_col + $predictor_cols + $out_file1 + 1>/dev/null + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/> + <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" /> + <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true"> + <validator type="no_options" message="Please select at least one column."/> + </param> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + </requirements> + <tests> + <!-- Test data with valid values --> + <test> + <param name="input1" value="reg_inp.tab"/> + <param name="response_col" value="1"/> + <param name="predictor_cols" value="2,3,4"/> + <output name="out_file1" file="rcve_out.dat"/> + </test> + + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* + +----- + +.. class:: infomark + +**What it does** + +This tool computes the RCVE (Relative Contribution to Variance Explained) for all possible variable subsets using the following formula: + +**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**, +which denotes the case where the 'i'th predictor is dropped. 
+ + +In general, +**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**, +where, + +- {X,X+} denotes the set of all predictors, +- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one), +- {X} is the set of the predictors that are left in the reduced model after excluding {X+} + + +The 4 columns in the output are described below: + +- Column 1 (Model): denotes the variables present in the model ({X}) +- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1 +- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model. +- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model. + + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rcve_out.dat Tue Apr 01 10:52:35 2014 -0400 @@ -0,0 +1,8 @@ +#Model R-sq RCVE_Terms RCVE_Value +2 3 4 0.3997 - - +3 4 0.3319 2 0.1697 +2 4 0.2974 3 0.2561 +2 3 0.3985 4 0.0031 +4 0.1226 2 3 0.6934 +3 0.2733 2 4 0.3164 +2 0.2972 3 4 0.2564
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reg_inp.tab Tue Apr 01 10:52:35 2014 -0400 @@ -0,0 +1,100 @@ +2.04 2.01 1070 5 +2.56 3.40 1254 6 +3.75 3.68 1466 6 +1.10 1.54 706 4 +3.00 3.32 1160 5 +0.05 0.33 756 3 +1.38 0.36 1058 2 +1.50 1.97 1008 7 +1.38 2.03 1104 4 +4.01 2.05 1200 7 +1.50 2.13 896 7 +1.29 1.34 848 3 +1.90 1.51 958 5 +3.11 3.12 1246 6 +1.92 2.14 1106 4 +0.81 2.60 790 5 +1.01 1.90 954 4 +3.66 3.06 1500 6 +2.00 1.60 1046 5 +2.05 1.96 1054 4 +2.60 1.96 1198 6 +2.55 1.56 940 3 +0.38 1.60 456 6 +2.48 1.92 1150 7 +2.74 3.09 636 6 +1.77 0.78 744 5 +1.61 2.12 644 5 +0.99 1.85 842 3 +1.62 1.78 852 5 +2.03 1.03 1170 3 +3.50 3.44 1034 10 +3.18 2.42 1202 5 +2.39 1.74 1018 5 +1.48 1.89 1180 5 +1.54 1.43 952 3 +1.57 1.64 1038 4 +2.46 2.69 1090 6 +2.42 1.79 694 5 +2.11 2.72 1096 6 +2.04 2.15 1114 5 +1.68 2.22 1256 6 +1.64 1.55 1208 5 +2.41 2.34 820 6 +2.10 2.92 1222 4 +1.40 2.10 1120 5 +2.03 1.64 886 4 +1.99 2.83 1126 7 +2.24 1.76 1158 4 +0.45 1.81 676 6 +2.31 2.68 1214 7 +2.41 2.55 1136 6 +2.56 2.70 1264 6 +2.50 1.66 1116 3 +2.92 2.23 1292 4 +2.35 2.01 604 5 +2.82 1.24 854 6 +1.80 1.95 814 6 +1.29 1.73 778 3 +1.68 1.08 800 2 +3.44 3.46 1424 7 +1.90 3.01 950 6 +2.06 0.54 1056 3 +3.30 3.20 956 8 +1.80 1.50 1352 5 +2.00 1.71 852 5 +1.68 1.99 1168 5 +1.94 2.76 970 6 +0.97 1.56 776 4 +1.12 1.78 854 6 +1.31 1.32 1232 5 +1.68 0.87 1140 6 +3.09 1.75 1084 4 +1.87 1.41 954 2 +2.00 2.77 1000 4 +2.39 1.78 1084 4 +1.50 1.34 1058 4 +1.82 1.52 816 5 +1.80 2.97 1146 7 +2.01 1.75 1000 6 +1.88 1.64 856 4 +1.64 1.80 798 4 +2.42 3.37 1324 6 +0.22 1.15 704 6 +2.31 1.72 1222 5 +0.95 2.27 948 6 +1.99 2.85 1182 8 +1.86 2.21 1000 6 +1.79 1.94 910 6 +3.02 4.25 1374 9 +1.85 1.83 1014 6 +1.98 2.75 1420 7 +2.15 1.71 400 6 +1.46 2.20 998 7 +2.29 2.13 776 6 +2.39 2.38 1134 7 +1.80 1.64 772 4 +2.64 1.87 1304 6 +2.08 2.53 1212 4 +0.70 1.78 818 6 +0.89 1.20 864 2 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Apr 01 10:52:35 2014 -0400 @@ -0,0 +1,12 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="numpy" version="1.7.1"> + <repository changeset_revision="0c288abd2a1e" name="package_numpy_1_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="rpy" version="1.0.3"> + <repository changeset_revision="82170c94ca7c" name="package_rpy_1_0_3" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="R" version="2.11.0"> + <repository changeset_revision="6e1b17857732" name="package_r_2_11_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>