annotate maaslin-4450aa4ecc84/src/PCLToGraphlanCoreGene.py @ 1:a87d5a5f2776

Uploaded the version running on the prod server
author george-weingart
date Sun, 08 Feb 2015 23:08:38 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
1 #!/usr/bin/env python
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
2 #####################################################################################
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
3 #Copyright (C) <2012>
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
4 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
5 #Permission is hereby granted, free of charge, to any person obtaining a copy of
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
6 #this software and associated documentation files (the "Software"), to deal in the
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
7 #Software without restriction, including without limitation the rights to use, copy,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
8 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
9 #and to permit persons to whom the Software is furnished to do so, subject to
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
10 #the following conditions:
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
11 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
12 #The above copyright notice and this permission notice shall be included in all copies
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
13 #or substantial portions of the Software.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
14 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
15 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
16 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
17 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
18 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
19 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
20 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
21 #
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
22 # This file is a component of the MaAsLin (Multivariate Associations Using Linear Models),
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
23 # authored by the Huttenhower lab at the Harvard School of Public Health
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
24 # (contact Timothy Tickle, ttickle@hsph.harvard.edu).
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
25 #####################################################################################
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
26
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
27 __author__ = "Timothy Tickle"
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
28 __copyright__ = "Copyright 2012"
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
29 __credits__ = ["Timothy Tickle"]
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
30 __license__ = ""
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
31 __version__ = ""
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
32 __maintainer__ = "Timothy Tickle"
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
33 __email__ = "ttickle@sph.harvard.edu"
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
34 __status__ = "Development"
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
35
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
36 import argparse
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
37 import csv
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
38 from operator import itemgetter
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
39 import re
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
40 import sys
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
41
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
42 #Helper function which returns a boolean indicator of an input string being parsable as an int
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
def funcIsInt(strInt):
    """Report whether a value can be parsed as an int.

    strInt: Value to test; the callers in this file pass string tokens.

    Returns True when int(strInt) succeeds, False otherwise.
    """
    try:
        int(strInt)
    except (TypeError, ValueError):
        # Only conversion failures mean "not an int". A bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        return False
    return True
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
49
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
50 #Helper function that converts a row selector (a name, a 1-based number, or the two ends of a "-" range) into 0-based indices; a blank range start becomes index 2 and a blank range end becomes the last index
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
51 # This supports the ranging in the read.config files
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
52 #If no range is given then the result is just one index of the given name
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
def funcGetIndices(lsFeature, lsFunctionNames):
    """Convert one row selector into a list of 0-based indices.

    lsFeature: The parts of a selector: one element for a single row, two
        elements for the start and end of a range. Each part is a number
        (reduced by 1 to make it 0-based), a name to look up in
        lsFunctionNames, or "" for an open range end.
    lsFunctionNames: List of names that selector names are looked up in.

    Returns a list holding one index per resolved part. Selectors with more
    than two parts are converted part by part rather than yielding None, so
    the caller can check the length and report the problem itself.

    Raises ValueError when a name is not present in lsFunctionNames.
    """
    if len(lsFeature) == 1:
        sOnly = lsFeature[0]
        if funcIsInt(sOnly):
            # Wrapped in a list so every path returns the same type; callers
            # sort and extend the result.
            return [int(sOnly) - 1]
        return [lsFunctionNames.index(sOnly)]

    iIndices = []
    for iPosition, sFeature in enumerate(lsFeature, 1):
        if sFeature == "":
            if iPosition == 1:
                # Open range start. The constant 2 is inherited unchanged;
                # NOTE(review): presumably the first row after the header
                # rows - confirm against the PCL layout.
                iIndices.append(2)
            elif iPosition == 2:
                # Open range end: run through the last known name.
                iIndices.append(len(lsFunctionNames) - 1)
        elif funcIsInt(sFeature):
            iIndices.append(int(sFeature) - 1)
        else:
            iIndices.append(lsFunctionNames.index(sFeature))
    return iIndices
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
73
a87d5a5f2776 Uploaded the version running on the prod server
george-weingart
parents:
diff changeset
#Constants
#Read config keyword that introduces the name of a matrix section
c_MatrixName = "Matrix:"
#Name of the matrix section whose row selection is used
c_DataMatrix = "Abundance"
#Read config keyword that introduces the rows to read
c_strRows = "Read_PCL_Rows:"

#Set up arguments reader
argp = argparse.ArgumentParser( prog = "PCLToGraphlanCoreGene.py",
    description = """Converts PCL files to Graphlan core gene files.""" )

#Arguments
argp.add_argument("strInputPCL", metavar = "PCLFile", type = argparse.FileType("r"), help ="Input PCl file used in maaslin")
argp.add_argument("strInputRC", metavar = "RCFile", type = argparse.FileType("r"), help ="Input read config file used in maaslin")
argp.add_argument("strOutputCoreGene", metavar = "CoreGeneFile", type = argparse.FileType("w"), help ="Output core gene file for graphlan")

args = argp.parse_args( )

#Read in read config table and get the rows/columns to use
#Indicates if we are reading a data matrix
fIsData = False
#Holds the indices ranges
#List of lists, each internal list holds 1 or 2 indices, if two it indicates a range from the first to the second
llsIndices = []
csvRC = open(args.strInputRC,'r') if isinstance(args.strInputRC, str) else args.strInputRC
#argparse.FileType hands over open file objects, which cannot be concatenated
#into a message, so keep a printable name for diagnostics
strRCName = getattr(csvRC, "name", str(args.strInputRC))
with csvRC:
    fRC = csv.reader(csvRC, delimiter=" ")
    for sLine in fRC:
        if not sLine:
            continue
        #Track whether the current section describes the data matrix;
        #a keyword with nothing after it cannot name the data matrix
        if sLine[0] == c_MatrixName:
            fIsData = len(sLine) > 1 and sLine[1] == c_DataMatrix
        #Get the row indices or names of the data matrix, then stop reading
        if sLine[0] == c_strRows and fIsData and len(sLine) > 1:
            llsIndices = [sIndexRange.split("-") for sIndexRange in sLine[1].split(",")]
            break

# Check to make sure RC file is read
if not llsIndices:
    print("PCLToGraphlanCoreGene:: Could Not find indices in RC file "+strRCName+".")

#Read in the PCL file and parse the file names to core genes format
csvPCL = open(args.strInputPCL,'r') if isinstance(args.strInputPCL, str) else args.strInputPCL
strPCLName = getattr(csvPCL, "name", str(args.strInputPCL))
with csvPCL:
    fPCL = csv.reader(csvPCL,delimiter="\t")
    #The first column of the csv file
    lsFeatureNames = [sLine[0] for sLine in fPCL]

# Check to make sure PCL file is read
if not lsFeatureNames:
    print("PCLToGraphlanCoreGene:: Could Not find features in PCL file "+strPCLName+".")

#If the indices are names switch with numbers otherwise subtract 1 because they are meant for R
liConvertedRangedIndices = [funcGetIndices(sIndex,lsFeatureNames) for sIndex in llsIndices]
llsIndices = None

#If there are any ranges, reduce to lists of indices
liConvertedIndices = []
for lsIndices in liConvertedRangedIndices:
    lsIndices.sort()
    iLenIndices = len(lsIndices)
    if iLenIndices > 2:
        print("Error, received more than 2 indices in a range. Stopped.")
        #sys.exit does not depend on the site module; exit status is left as it was
        sys.exit()
    liConvertedIndices.extend(lsIndices if iLenIndices == 1 else range(lsIndices[0],lsIndices[1]+1))
liConvertedRangedIndices = None

#Collapse all indices to a set which is then sorted
liConvertedIndices = sorted(set(liConvertedIndices))

#Reduce name of features to just bugs indicated by indices
#Indexing one at a time always yields a list, also for zero or one index
#(itemgetter returns a bare string for one index and rejects zero)
lsFeatureNames = [lsFeatureNames[iIndex] for iIndex in liConvertedIndices]
liConvertedIndices = None

#Change the bug names to the correct formatting (clades separated by .)
lsFeatureNames = sorted(lsFeatureNames)
#Drop the leading one-letter clade prefix (for example "k__")
lsFeatureNames = [re.sub(r"^[A-Za-z]__","",sBug) for sBug in lsFeatureNames]
#Replace the remaining clade prefixes and | delimiters with .
lsFeatureNames = [[re.sub(r"\|*[A-Za-z]__|\|",".",sBug)] for sBug in lsFeatureNames]

#If this is an OTU, append the number and the genus level together for a more descriptive terminal name
lsFeatureNamesModForOTU = []
for sBug in lsFeatureNames:
    lsBug = sBug[0].split(".")
    if len(lsBug) > 1:
        if lsBug[-1].isdigit():
            #Fold a purely numeric last clade into the clade before it
            lsBug[-2] = lsBug[-2]+"_"+lsBug[-1]
            lsBug = lsBug[0:-1]
        lsFeatureNamesModForOTU.append([".".join(lsBug)])
    else:
        lsFeatureNamesModForOTU.append([lsBug[0]])

#Output core gene file
csvCG = open(args.strOutputCoreGene,'w') if isinstance(args.strOutputCoreGene, str) else args.strOutputCoreGene
with csvCG:
    fCG = csv.writer(csvCG)
    fCG.writerows(lsFeatureNamesModForOTU)