# HG changeset patch # User devteam # Date 1396363886 14400 # Node ID bd196d7c1ca96977e74d7f7db37d751c673eef29 Imported from capsule None diff -r 000000000000 -r bd196d7c1ca9 logistic_regression_vif.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/logistic_regression_vif.py Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,167 @@ +#!/usr/bin/env python + +import sys +from rpy import * +import numpy + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit() + +infile = sys.argv[1] +y_col = int(sys.argv[2])-1 +x_cols = sys.argv[3].split(',') +outfile = sys.argv[4] + + +print "Predictor columns: %s; Response column: %d" % ( x_cols, y_col+1 ) +fout = open(outfile,'w') +elems = [] +for i, line in enumerate( file( infile ) ): + line = line.rstrip('\r\n') + if len( line )>0 and not line.startswith( '#' ): + elems = line.split( '\t' ) + break + if i == 30: + break # Hopefully we'll never get here... + +if len( elems )<1: + stop_err( "The data in your input dataset is either missing or not formatted properly." ) + +y_vals = [] +x_vals = [] + +for k, col in enumerate(x_cols): + x_cols[k] = int(col)-1 + x_vals.append([]) + +NA = 'NA' +for ind, line in enumerate( file( infile )): + if line and not line.startswith( '#' ): + try: + fields = line.split("\t") + try: + yval = float(fields[y_col]) + except: + yval = r('NA') + y_vals.append(yval) + for k, col in enumerate(x_cols): + try: + xval = float(fields[col]) + except: + xval = r('NA') + x_vals[k].append(xval) + except: + pass + +x_vals1 = numpy.asarray(x_vals).transpose() + +check1 = 0 +check0 = 0 +for i in y_vals: + if i == 1: + check1 = 1 + if i == 0: + check0 = 1 +if check1 == 0 or check0 == 0: + sys.exit("Warning: logistic regression must have at least two classes") + +for i in y_vals: + if i not in [1, 0, r('NA')]: + print >> fout, str(i) + sys.exit("Warning: the current version of this tool can run only with two classes and need to be labeled as 0 and 1.") + +dat = r.list(x=array(x_vals1), y=y_vals) +novif = 0 +set_default_mode(NO_CONVERSION) +try: + linear_model = r.glm(r("y ~ x"), data=r.na_exclude(dat), family="binomial") +except RException, rex: + stop_err("Error performing logistic regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") +if len(x_cols)>1: + try: + r('suppressPackageStartupMessages(library(car))') + r.assign('dat', dat) + r.assign('ncols', len(x_cols)) + vif = r.vif(r('glm(dat$y ~ ., data = na.exclude(data.frame(as.matrix(dat$x,ncol=ncols))->datx), family="binomial")')) + except RException, rex: + print rex +else: + novif = 1 + +set_default_mode(BASIC_CONVERSION) + +coeffs = linear_model.as_py()['coefficients'] +null_deviance = linear_model.as_py()['null.deviance'] +residual_deviance = linear_model.as_py()['deviance'] +yintercept = coeffs['(Intercept)'] +summary = r.summary(linear_model) +co = summary.get('coefficients', 'NA') +""" +if len(co) != len(x_vals)+1: + stop_err("Stopped performing logistic regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.") +""" + +try: + yintercept = r.round(float(yintercept), digits=10) + pvaly = r.round(float(co[0][3]), digits=10) +except: + pass +print >> fout, "response column\tc%d" % (y_col+1) +tempP = [] +for i in x_cols: + tempP.append('c'+str(i+1)) +tempP = ','.join(tempP) +print >> fout, "predictor column(s)\t%s" % (tempP) +print >> fout, "Y-intercept\t%s" % (yintercept) +print >> fout, "p-value (Y-intercept)\t%s" % (pvaly) + +if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable + try: + slope = r.round(float(coeffs['x']), digits=10) + except: + slope = 'NA' + try: + pval = r.round(float(co[1][3]), digits=10) + except: + pval = 'NA' + print >> fout, "Slope (c%d)\t%s" % ( x_cols[0]+1, slope ) + print >> fout, "p-value (c%d)\t%s" % ( x_cols[0]+1, pval ) +else: #Multiple regression case with >1 predictors + ind = 1 + while ind < len(coeffs.keys()): + try: + slope = r.round(float(coeffs['x'+str(ind)]), digits=10) + except: + slope = 'NA' + print >> fout, "Slope (c%d)\t%s" % ( x_cols[ind-1]+1, slope ) + try: + pval = r.round(float(co[ind][3]), digits=10) + except: + pval = 'NA' + print >> fout, "p-value (c%d)\t%s" % ( x_cols[ind-1]+1, pval ) + ind += 1 + +rsq = summary.get('r.squared','NA') + +try: + rsq = r.round(float((null_deviance-residual_deviance)/null_deviance), digits=5) + null_deviance = r.round(float(null_deviance), digits=5) + residual_deviance = r.round(float(residual_deviance), digits=5) +except: + pass + +print >> fout, "Null deviance\t%s" % (null_deviance) +print >> fout, "Residual deviance\t%s" % (residual_deviance) +print >> fout, "pseudo R-squared\t%s" % (rsq) +print >> fout, "\n" +print >> fout, 'vif' + +if novif == 0: + py_vif = vif.as_py() + count = 0 + for i in sorted(py_vif.keys()): + print >> fout, 'c'+str(x_cols[count]+1), str(py_vif[i]) + count += 1 +elif novif == 1: + print >> fout, "vif can calculate only when model have more than 1 predictor" diff -r 000000000000 -r bd196d7c1ca9 logistic_regression_vif.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/logistic_regression_vif.xml Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,79 @@ + + + + numpy + rpy + R + + + logistic_regression_vif.py + $input1 + $response_col + $predictor_cols + $out_file1 + 1>/dev/null + + + + + + + + + + + + + + rpy + + + + + + + + + + + + + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* + +----- + +.. class:: infomark + +**What it does** + +This tool uses the **'glm'** function from R statistical package to perform logistic regression on the input data. It outputs one file containing the summary statistics of the performed regression. Also, it calculates VIF(Variance Inflation Factor) with **'vif'** function from library (car) in R. + + +*R Development Core Team (2010). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.* + +----- + +.. class:: warningmark + +**Note** + +- This tool currently treats all predictor variables as continuous numeric variables and response variable as categorical variable. Currently, the response variable can have only two classes, namely 0 and 1. The program will take 0 as base class. + +- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. + +- The summary statistics in the output are described below: + +- Pseudo R-squared: the proportion of model improvement from null model +- p-value: p-value for the z-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative. +- Coefficient indicates log ratio of (probability to be class 1 / probability to be class 0) + +- This tool also provides **Variance Inflation Factor or VIF** which quantifies the level of multicollinearity. The tool will automatic generate VIF if the model has more than one predictor. The higher the VIF, the higher is the multicollinearity. Multicollinearity will inflate standard error and reduce level of significance of the predictor. In the worst case, it can reverse direction of slope for highly correlated predictors if one of them is significant. A general thumb-rule is to use those predictors having VIF lower than 10 or 5. +- **vif** is calculated by + - First, regressing each predictor over all other predictors, and recording R-squared for each regression. + - Second, computing vif as 1/(1- R_squared) + + + diff -r 000000000000 -r bd196d7c1ca9 test-data/2.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.tabular Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,10 @@ +1 68 4.1 +2 71 4.6 +3 62 3.8 +4 75 4.4 +5 58 3.2 +6 60 3.1 +7 67 3.8 +8 68 4.1 +9 71 4.3 +10 69 3.7 diff -r 000000000000 -r bd196d7c1ca9 test-data/logreg_inp.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/logreg_inp.tabular Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,100 @@ +2.04 2.01 1070 1 +2.56 3.4 1254 1 +3.75 3.68 1466 1 +1.1 1.54 706 1 +3 3.32 1160 1 +0.05 0.33 756 1 +1.38 0.36 1058 1 +1.5 1.97 1008 0 +1.38 2.03 1104 1 +4.01 2.05 1200 1 +1.5 2.13 896 1 +1.29 1.34 848 1 +1.9 1.51 958 1 +3.11 3.12 1246 0 +1.92 2.14 1106 1 +0.81 2.6 790 1 +1.01 1.9 954 1 +3.66 3.06 1500 0 +2 1.6 1046 0 +2.05 1.96 1054 1 +2.6 1.96 1198 0 +2.55 1.56 940 1 +0.38 1.6 456 0 +2.48 1.92 1150 1 +2.74 3.09 636 0 +1.77 0.78 744 1 +1.61 2.12 644 0 +0.99 1.85 842 1 +1.62 1.78 852 1 +2.03 1.03 1170 0 +3.5 3.44 1034 1 +3.18 2.42 1202 1 +2.39 1.74 1018 1 +1.48 1.89 1180 1 +1.54 1.43 952 0 +1.57 1.64 1038 1 +2.46 2.69 1090 0 +2.42 1.79 694 0 +2.11 2.72 1096 0 +2.04 2.15 1114 0 +1.68 2.22 1256 1 +1.64 1.55 1208 0 +2.41 2.34 820 0 +2.1 2.92 1222 0 +1.4 2.1 1120 0 +2.03 1.64 886 0 +1.99 2.83 1126 0 +2.24 1.76 1158 0 +0.45 1.81 676 0 +2.31 2.68 1214 0 +2.41 2.55 1136 1 +2.56 2.7 1264 0 +2.5 1.66 1116 1 +2.92 2.23 1292 1 +2.35 2.01 604 1 +2.82 1.24 854 1 +1.8 1.95 814 0 +1.29 1.73 778 1 +1.68 1.08 800 0 +3.44 3.46 1424 0 +1.9 3.01 950 0 +2.06 0.54 1056 1 +3.3 3.2 956 1 +1.8 1.5 1352 1 +2 1.71 852 1 +1.68 1.99 1168 0 +1.94 2.76 970 1 +0.97 1.56 776 1 +1.12 1.78 854 1 +1.31 1.32 1232 0 +1.68 0.87 1140 0 +3.09 1.75 1084 0 +1.87 1.41 954 0 +2 2.77 1000 0 +2.39 1.78 1084 0 +1.5 1.34 1058 0 +1.82 1.52 816 0 +1.8 2.97 1146 0 +2.01 1.75 1000 1 +1.88 1.64 856 1 +1.64 1.8 798 1 +2.42 3.37 1324 1 +0.22 1.15 704 1 +2.31 1.72 1222 1 +0.95 2.27 948 0 +1.99 2.85 1182 0 +1.86 2.21 1000 1 +1.79 1.94 910 0 +3.02 4.25 1374 1 +1.85 1.83 1014 0 +1.98 2.75 1420 0 +2.15 1.71 400 0 +1.46 2.2 998 1 +2.29 2.13 776 1 +2.39 2.38 1134 0 +1.8 1.64 772 0 +2.64 1.87 1304 0 +2.08 2.53 1212 0 +0.7 1.78 818 1 +0.89 1.2 864 1 \ No newline at end of file diff -r 000000000000 -r bd196d7c1ca9 test-data/logreg_out2.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/logreg_out2.tabular Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,19 @@ +response column c4 +predictor column(s) c1,c2,c3 +Y-intercept 0.9111624714 +p-value (Y-intercept) 0.3571052008 +Slope (c1) 0.057995684 +p-value (c1) 0.8677866885 +Slope (c2) -0.2319990287 +p-value (c2) 0.4986584837 +Slope (c3) -0.0004633556 +p-value (c3) 0.6785709433 +Null deviance 138.46939 +Residual deviance 137.44023 +pseudo R-squared 0.00743 + + +vif +c1 1.65649272465 +c2 1.47696547452 +c3 1.4307725027 diff -r 000000000000 -r bd196d7c1ca9 test-data/reg_inp.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/reg_inp.tab Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,100 @@ +2.04 2.01 1070 5 +2.56 3.40 1254 6 +3.75 3.68 1466 6 +1.10 1.54 706 4 +3.00 3.32 1160 5 +0.05 0.33 756 3 +1.38 0.36 1058 2 +1.50 1.97 1008 7 +1.38 2.03 1104 4 +4.01 2.05 1200 7 +1.50 2.13 896 7 +1.29 1.34 848 3 +1.90 1.51 958 5 +3.11 3.12 1246 6 +1.92 2.14 1106 4 +0.81 2.60 790 5 +1.01 1.90 954 4 +3.66 3.06 1500 6 +2.00 1.60 1046 5 +2.05 1.96 1054 4 +2.60 1.96 1198 6 +2.55 1.56 940 3 +0.38 1.60 456 6 +2.48 1.92 1150 7 +2.74 3.09 636 6 +1.77 0.78 744 5 +1.61 2.12 644 5 +0.99 1.85 842 3 +1.62 1.78 852 5 +2.03 1.03 1170 3 +3.50 3.44 1034 10 +3.18 2.42 1202 5 +2.39 1.74 1018 5 +1.48 1.89 1180 5 +1.54 1.43 952 3 +1.57 1.64 1038 4 +2.46 2.69 1090 6 +2.42 1.79 694 5 +2.11 2.72 1096 6 +2.04 2.15 1114 5 +1.68 2.22 1256 6 +1.64 1.55 1208 5 +2.41 2.34 820 6 +2.10 2.92 1222 4 +1.40 2.10 1120 5 +2.03 1.64 886 4 +1.99 2.83 1126 7 +2.24 1.76 1158 4 +0.45 1.81 676 6 +2.31 2.68 1214 7 +2.41 2.55 1136 6 +2.56 2.70 1264 6 +2.50 1.66 1116 3 +2.92 2.23 1292 4 +2.35 2.01 604 5 +2.82 1.24 854 6 +1.80 1.95 814 6 +1.29 1.73 778 3 +1.68 1.08 800 2 +3.44 3.46 1424 7 +1.90 3.01 950 6 +2.06 0.54 1056 3 +3.30 3.20 956 8 +1.80 1.50 1352 5 +2.00 1.71 852 5 +1.68 1.99 1168 5 +1.94 2.76 970 6 +0.97 1.56 776 4 +1.12 1.78 854 6 +1.31 1.32 1232 5 +1.68 0.87 1140 6 +3.09 1.75 1084 4 +1.87 1.41 954 2 +2.00 2.77 1000 4 +2.39 1.78 1084 4 +1.50 1.34 1058 4 +1.82 1.52 816 5 +1.80 2.97 1146 7 +2.01 1.75 1000 6 +1.88 1.64 856 4 +1.64 1.80 798 4 +2.42 3.37 1324 6 +0.22 1.15 704 6 +2.31 1.72 1222 5 +0.95 2.27 948 6 +1.99 2.85 1182 8 +1.86 2.21 1000 6 +1.79 1.94 910 6 +3.02 4.25 1374 9 +1.85 1.83 1014 6 +1.98 2.75 1420 7 +2.15 1.71 400 6 +1.46 2.20 998 7 +2.29 2.13 776 6 +2.39 2.38 1134 7 +1.80 1.64 772 4 +2.64 1.87 1304 6 +2.08 2.53 1212 4 +0.70 1.78 818 6 +0.89 1.20 864 2 \ No newline at end of file diff -r 000000000000 -r bd196d7c1ca9 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Apr 01 10:51:26 2014 -0400 @@ -0,0 +1,12 @@ + + + + + + + + + + + +