comparison bin/design_matrix_creator @ 1:a4a4c88783ea draft

planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author yhoogstrate
date Tue, 01 Sep 2015 04:59:05 -0400
parents
children
comparison
equal deleted inserted replaced
0:acb56603b268 1:a4a4c88783ea
1 #!/usr/bin/env python
2
3 import argparse, os, shutil, sys, tempfile, subprocess
4
5 __version_info__ = ('1', '0', '0')#, 'beta')
6 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3]
7 __author__ = 'Youri Hoogstrate'
8 __homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools'
9 __license__ = 'GNU General Public License v3 (GPLv3)'
10
11
12 class sampleContainer:
13 def __init__(self):
14 self.samples = []
15 self.treatments = {}
16 self.treatment_index = []
17 self.treatment_types = {}
18
19 def do_decode(self,encoded_str):
20 return encoded_str.decode("base64").strip().replace("\t",'')
21
22 def add_samples(self,argument):
23 print " - Adding samples"
24 for sample in argument:
25 self.add_sample(self.do_decode(sample))
26
27 def add_sample(self,sample):
28 if(sample in self.samples):
29 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n")
30 sys.exit(1)
31 else:
32 self.samples.append(sample)
33 print " - Added: "+sample
34
35 def add_blocking(self,argument):
36 print " - Adding paired samples"
37 pair = []
38 for block in argument:
39 self.add_block(block)
40
41 def add_block(self,blocks):
42 blocks = blocks.split(":")
43 as_treatment = blocks[0]
44 blocks = blocks[1:]
45
46 used_samples = []
47 indexed_samples = {}
48
49 for i in range(len(blocks)):
50 block = blocks[i]
51 samples = self.get_samples_from_block(block)
52 indexed_samples[i+1] = []
53 for sample in samples:
54 if(sample in used_samples):
55 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n")
56 sys.exit(0)
57 else:
58 indexed_samples[i+1] = block
59 used_samples.append(sample)
60
61 for sample in self.samples:
62 if(sample not in used_samples):
63 i = i + 1
64 indexed_samples[i+1] = str(sample).encode('base64').strip()
65
66 for index in indexed_samples.keys():
67 key = str(index).encode('base64').strip()
68 as_treatment += ":"+key+":"+indexed_samples[index]
69
70 self.add_treatment(as_treatment)
71
72 def get_samples_from_block(self,decoded_block):
73 return [ self.do_decode(x) for x in decoded_block.split(",")]
74
75 def add_treatments(self,argument):
76 print " - Adding treatments"
77 for treatment in argument:
78 self.add_treatment(treatment)
79
80 def add_treatment(self,treatment_argument):
81 print " - Parsing treatment"
82
83
84 treatment_argument = treatment_argument.split(":")
85 name = self.do_decode(treatment_argument[0])
86 treatment_argument = treatment_argument[1:]
87
88
89 treatment = {"factor_index":{},"sample_index":{}}
90 only_integers = True
91
92 i = 1
93 for item in treatment_argument:
94 if(i % 2):
95 factor = self.do_decode(item)
96
97 if(treatment['factor_index'].has_key(factor)):
98 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
99 sys.exit(0)
100 else:
101 print " - Adding factor: "+factor
102 treatment["factor_index"][factor] = []
103 if(not factor.isdigit()):
104 only_integers = False
105 else:
106 for sample in item.split(","):
107 sample = self.do_decode(sample)
108
109 if(not sample in self.samples):
110 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n")
111 sys.exit(0)
112
113 treatment["factor_index"][factor].append(sample)
114 if(treatment["sample_index"].has_key(sample)):
115 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
116 sys.exit(0)
117 else:
118 treatment["sample_index"][sample] = factor
119 i += 1
120
121 treatment_factors = sorted(treatment["factor_index"].keys())
122
123 if(name == None):
124 treatment["name"] = "_vs_".join(treatment_factors)
125 else:
126 treatment["name"] = str(name)
127
128 if(len(treatment["sample_index"]) != len(self.samples)):
129 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
130
131 if(only_integers):
132 treatment_type = "integer"
133 else:
134 treatment_type = "string"
135
136 if(self.treatments.has_key(treatment["name"])):
137 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
138 else:
139 self.treatments[treatment["name"]] = treatment
140 self.treatment_index.append(treatment["name"])
141 self.treatment_types[treatment["name"]] = treatment_type
142 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid"
143
144 def export(self,output):
145 # Open file stream
146 if(args.output == "-"):
147 fh = sys.stdout
148 else:
149 fh = open(args.output,"w")
150
151 # Write header:
152 fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n")
153
154 # Write body:
155 for sample in self.samples:
156 fh.write(sample)
157 for treatment_id in self.treatment_index:
158 treatment = self.treatments[treatment_id]
159 fh.write("\t"+treatment["sample_index"][sample])
160 fh.write("\n")
161
162 fh.close()
163
164 if __name__=="__main__":
165 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
166 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
167 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)')
168 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)')
169 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True)
170 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"')
171
172 args = parser.parse_args()
173
174 columns = None
175 if(args.columns_file):
176 with open(args.columns_file, "r") as f:
177 listed_columns = [None] + f.readline().strip("\n").split("\t")
178 for i in range(1,len(listed_columns)):
179 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','')
180
181 s = sampleContainer()
182
183 if(listed_columns):
184 columns = []
185 for sample in args.sample_names:
186 columns.append(listed_columns[int(sample)])
187
188
189 treatments = []
190 for treatment in args.treatments:
191 treatment = treatment.split(":")
192 for i in range(1,len(treatment)):
193 if(i%2 == 0):
194 treatment_tmp = treatment[i].split(",")
195 for j in range(len(treatment_tmp)):
196 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])]
197 treatment[i] = ",".join(treatment_tmp)
198
199 treatments.append(":".join(treatment))
200
201 blockings = []
202 if(args.blocking):
203 for blocking in args.blocking:
204 blocking = blocking.split(":")
205 for i in range(1,len(blocking)):
206 block = blocking[i].split(",")
207 for j in range(len(block)):
208 block[j] = listed_columns[int(block[j])]
209 blocking[i] = ",".join(block)
210 blockings.append(":".join(blocking))
211
212 s.add_samples(columns)
213 s.add_treatments(treatments)
214 s.add_blocking(blockings)
215
216 else:
217 s.add_samples(args.sample_names)
218 s.add_treatments(args.treatments)
219 if(args.blocking):
220 s.add_blocking(args.blocking)
221
222 s.export(args.output)