comparison tools/stats/cor.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 #Greg Von Kuster
3 """
4 Calculate correlations between numeric columns in a tab delim file.
5 usage: %prog infile output.txt columns method
6 """
7
8 import sys
9 from rpy import *
10
11 def stop_err(msg):
12 sys.stderr.write(msg)
13 sys.exit()
14
15 def main():
16 method = sys.argv[4]
17 assert method in ( "pearson", "kendall", "spearman" )
18
19 try:
20 columns = map( int, sys.argv[3].split( ',' ) )
21 except:
22 stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." )
23
24 matrix = []
25 skipped_lines = 0
26 first_invalid_line = 0
27 invalid_value = ''
28 invalid_column = 0
29
30 for i, line in enumerate( file( sys.argv[1] ) ):
31 valid = True
32 line = line.rstrip('\n\r')
33
34 if line and not line.startswith( '#' ):
35 # Extract values and convert to floats
36 row = []
37 for column in columns:
38 column -= 1
39 fields = line.split( "\t" )
40 if len( fields ) <= column:
41 valid = False
42 else:
43 val = fields[column]
44 if val.lower() == "na":
45 row.append( float( "nan" ) )
46 else:
47 try:
48 row.append( float( fields[column] ) )
49 except:
50 valid = False
51 skipped_lines += 1
52 if not first_invalid_line:
53 first_invalid_line = i+1
54 invalid_value = fields[column]
55 invalid_column = column+1
56 else:
57 valid = False
58 skipped_lines += 1
59 if not first_invalid_line:
60 first_invalid_line = i+1
61
62 if valid:
63 matrix.append( row )
64
65 if skipped_lines < i:
66 try:
67 out = open( sys.argv[2], "w" )
68 except:
69 stop_err( "Unable to open output file" )
70
71 # Run correlation
72 try:
73 value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method )
74 except Exception, exc:
75 out.close()
76 stop_err("%s" %str( exc ))
77 for row in value:
78 print >> out, "\t".join( map( str, row ) )
79 out.close()
80
81 if skipped_lines > 0:
82 msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line )
83 if invalid_value and invalid_column > 0:
84 msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column )
85 print msg
86
87 if __name__ == "__main__":
88 main()