Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison bin/design_matrix_creator @ 1:a4a4c88783ea draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author | yhoogstrate |
---|---|
date | Tue, 01 Sep 2015 04:59:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:acb56603b268 | 1:a4a4c88783ea |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse, os, shutil, sys, tempfile, subprocess | |
4 | |
5 __version_info__ = ('1', '0', '0')#, 'beta') | |
6 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] | |
7 __author__ = 'Youri Hoogstrate' | |
8 __homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' | |
9 __license__ = 'GNU General Public License v3 (GPLv3)' | |
10 | |
11 | |
12 class sampleContainer: | |
13 def __init__(self): | |
14 self.samples = [] | |
15 self.treatments = {} | |
16 self.treatment_index = [] | |
17 self.treatment_types = {} | |
18 | |
19 def do_decode(self,encoded_str): | |
20 return encoded_str.decode("base64").strip().replace("\t",'') | |
21 | |
22 def add_samples(self,argument): | |
23 print " - Adding samples" | |
24 for sample in argument: | |
25 self.add_sample(self.do_decode(sample)) | |
26 | |
27 def add_sample(self,sample): | |
28 if(sample in self.samples): | |
29 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") | |
30 sys.exit(1) | |
31 else: | |
32 self.samples.append(sample) | |
33 print " - Added: "+sample | |
34 | |
35 def add_blocking(self,argument): | |
36 print " - Adding paired samples" | |
37 pair = [] | |
38 for block in argument: | |
39 self.add_block(block) | |
40 | |
41 def add_block(self,blocks): | |
42 blocks = blocks.split(":") | |
43 as_treatment = blocks[0] | |
44 blocks = blocks[1:] | |
45 | |
46 used_samples = [] | |
47 indexed_samples = {} | |
48 | |
49 for i in range(len(blocks)): | |
50 block = blocks[i] | |
51 samples = self.get_samples_from_block(block) | |
52 indexed_samples[i+1] = [] | |
53 for sample in samples: | |
54 if(sample in used_samples): | |
55 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") | |
56 sys.exit(0) | |
57 else: | |
58 indexed_samples[i+1] = block | |
59 used_samples.append(sample) | |
60 | |
61 for sample in self.samples: | |
62 if(sample not in used_samples): | |
63 i = i + 1 | |
64 indexed_samples[i+1] = str(sample).encode('base64').strip() | |
65 | |
66 for index in indexed_samples.keys(): | |
67 key = str(index).encode('base64').strip() | |
68 as_treatment += ":"+key+":"+indexed_samples[index] | |
69 | |
70 self.add_treatment(as_treatment) | |
71 | |
72 def get_samples_from_block(self,decoded_block): | |
73 return [ self.do_decode(x) for x in decoded_block.split(",")] | |
74 | |
75 def add_treatments(self,argument): | |
76 print " - Adding treatments" | |
77 for treatment in argument: | |
78 self.add_treatment(treatment) | |
79 | |
80 def add_treatment(self,treatment_argument): | |
81 print " - Parsing treatment" | |
82 | |
83 | |
84 treatment_argument = treatment_argument.split(":") | |
85 name = self.do_decode(treatment_argument[0]) | |
86 treatment_argument = treatment_argument[1:] | |
87 | |
88 | |
89 treatment = {"factor_index":{},"sample_index":{}} | |
90 only_integers = True | |
91 | |
92 i = 1 | |
93 for item in treatment_argument: | |
94 if(i % 2): | |
95 factor = self.do_decode(item) | |
96 | |
97 if(treatment['factor_index'].has_key(factor)): | |
98 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") | |
99 sys.exit(0) | |
100 else: | |
101 print " - Adding factor: "+factor | |
102 treatment["factor_index"][factor] = [] | |
103 if(not factor.isdigit()): | |
104 only_integers = False | |
105 else: | |
106 for sample in item.split(","): | |
107 sample = self.do_decode(sample) | |
108 | |
109 if(not sample in self.samples): | |
110 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") | |
111 sys.exit(0) | |
112 | |
113 treatment["factor_index"][factor].append(sample) | |
114 if(treatment["sample_index"].has_key(sample)): | |
115 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") | |
116 sys.exit(0) | |
117 else: | |
118 treatment["sample_index"][sample] = factor | |
119 i += 1 | |
120 | |
121 treatment_factors = sorted(treatment["factor_index"].keys()) | |
122 | |
123 if(name == None): | |
124 treatment["name"] = "_vs_".join(treatment_factors) | |
125 else: | |
126 treatment["name"] = str(name) | |
127 | |
128 if(len(treatment["sample_index"]) != len(self.samples)): | |
129 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") | |
130 | |
131 if(only_integers): | |
132 treatment_type = "integer" | |
133 else: | |
134 treatment_type = "string" | |
135 | |
136 if(self.treatments.has_key(treatment["name"])): | |
137 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") | |
138 else: | |
139 self.treatments[treatment["name"]] = treatment | |
140 self.treatment_index.append(treatment["name"]) | |
141 self.treatment_types[treatment["name"]] = treatment_type | |
142 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" | |
143 | |
144 def export(self,output): | |
145 # Open file stream | |
146 if(args.output == "-"): | |
147 fh = sys.stdout | |
148 else: | |
149 fh = open(args.output,"w") | |
150 | |
151 # Write header: | |
152 fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") | |
153 | |
154 # Write body: | |
155 for sample in self.samples: | |
156 fh.write(sample) | |
157 for treatment_id in self.treatment_index: | |
158 treatment = self.treatments[treatment_id] | |
159 fh.write("\t"+treatment["sample_index"][sample]) | |
160 fh.write("\n") | |
161 | |
162 fh.close() | |
163 | |
164 if __name__=="__main__": | |
165 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") | |
166 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) | |
167 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') | |
168 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') | |
169 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) | |
170 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') | |
171 | |
172 args = parser.parse_args() | |
173 | |
174 columns = None | |
175 if(args.columns_file): | |
176 with open(args.columns_file, "r") as f: | |
177 listed_columns = [None] + f.readline().strip("\n").split("\t") | |
178 for i in range(1,len(listed_columns)): | |
179 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') | |
180 | |
181 s = sampleContainer() | |
182 | |
183 if(listed_columns): | |
184 columns = [] | |
185 for sample in args.sample_names: | |
186 columns.append(listed_columns[int(sample)]) | |
187 | |
188 | |
189 treatments = [] | |
190 for treatment in args.treatments: | |
191 treatment = treatment.split(":") | |
192 for i in range(1,len(treatment)): | |
193 if(i%2 == 0): | |
194 treatment_tmp = treatment[i].split(",") | |
195 for j in range(len(treatment_tmp)): | |
196 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] | |
197 treatment[i] = ",".join(treatment_tmp) | |
198 | |
199 treatments.append(":".join(treatment)) | |
200 | |
201 blockings = [] | |
202 if(args.blocking): | |
203 for blocking in args.blocking: | |
204 blocking = blocking.split(":") | |
205 for i in range(1,len(blocking)): | |
206 block = blocking[i].split(",") | |
207 for j in range(len(block)): | |
208 block[j] = listed_columns[int(block[j])] | |
209 blocking[i] = ",".join(block) | |
210 blockings.append(":".join(blocking)) | |
211 | |
212 s.add_samples(columns) | |
213 s.add_treatments(treatments) | |
214 s.add_blocking(blockings) | |
215 | |
216 else: | |
217 s.add_samples(args.sample_names) | |
218 s.add_treatments(args.treatments) | |
219 if(args.blocking): | |
220 s.add_blocking(args.blocking) | |
221 | |
222 s.export(args.output) |