stacks.py @ 0:d6ba40f6c824 ("first commit")
author: cmonjeau
date: Mon, 24 Aug 2015 09:29:12 +0000
children: 0e0ff9e9c761
"""

STACKS METHODS FOR GALAXY

Created by Cyril Monjeaud & Yvan Le Bras
Cyril.Monjeaud@irisa.fr
yvan.le_bras@irisa.fr

Last modifications : 01/22/2014

"""

import os, sys, re, shutil
import glob
import collections
import gzip, zipfile, tarfile
import subprocess
from galaxy.datatypes.checkers import *


"""

STACKS COMMON METHODS

galaxy_config_to_tabfiles(input_config)
galaxy_config_to_tabfiles_for_STACKS(input_config)
extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
create_symlinks_from_tabfiles(tab_files, tmp_input_dir)

"""
def galaxy_config_to_tabfiles(input_config):

    tab_files=collections.OrderedDict()
    for line in open(input_config, "r").readlines():
        if line.strip() != '':
            extract=line.strip().split("::")
            tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]

    # tab_files[name] -> path
    return tab_files

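# Illustrative example (hypothetical file names, not from the original tool):
# given a config file containing the line
#     sample1 (fastqsanger)::/galaxy/files/dataset_1.dat
# the call
#     tab_files = galaxy_config_to_tabfiles("denovo_map.config")
# would return
#     OrderedDict([('sample1.fastqsanger', '/galaxy/files/dataset_1.dat')])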

def galaxy_config_to_tabfiles_for_STACKS(input_config):

    tab_files=collections.OrderedDict()
    for line in open(input_config, "r").readlines():
        if line.strip() != '':
            extract=line.strip().split("::")
            parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
            # rename the Galaxy display name to a short name
            if parse_name:
                extract[0]=parse_name.groups(1)[0]

            tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]

    # tab_files[name] -> path
    return tab_files

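# Illustrative example (hypothetical Galaxy display name): a config entry whose
# name part reads
#     STACKS process radtags: demultiplexed reads (PopA.GCATG.fq)
# matches the regular expression above, so the key stored in tab_files becomes
# the short name "PopA.GCATG.fq" instead of the full Galaxy display name.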

def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):

    # for each file
    for key in tab_files.keys():
        # test if it is a zip file
        if (check_zip( tab_files[key] )):

            # collect the archive member names and add them to the tab
            myarchive = zipfile.ZipFile(tab_files[key], 'r')
            for i in myarchive.namelist():
                tab_files[i]=tmp_input_dir+"/"+i

            # extract all files
            myarchive.extractall(tmp_input_dir)

            # remove the compressed file from the tab
            del tab_files[key]

        # test if it is a tar.gz file
        else:
            if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
                # collect the archive member names and add them to the tab
                mygzfile = tarfile.open(tab_files[key], 'r')

                for i in mygzfile.getnames():
                    tab_files[i]=tmp_input_dir+"/"+i

                # extract all files
                mygzfile.extractall(tmp_input_dir)

                # remove the compressed file from the tab
                del tab_files[key]


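# Illustrative example (hypothetical names): if tab_files maps the key
# "reads.zip" to a dataset that is a zip archive containing PopA.GCATG.fq and
# PopA.TTACG.fq, then after the call the "reads.zip" entry is removed and two
# new entries point to the extracted copies inside tmp_input_dir:
#     tab_files["PopA.GCATG.fq"] == tmp_input_dir + "/PopA.GCATG.fq"
#     tab_files["PopA.TTACG.fq"] == tmp_input_dir + "/PopA.TTACG.fq"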
def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):

    for key in tab_files.keys():
        #print "file single: "+key+" -> "+tab_files[key]
        # create a sym_link in our temp dir
        if not os.path.exists(tmp_input_dir+'/'+key):
            os.symlink(tab_files[key], tmp_input_dir+'/'+key)


"""

PROCESS RADTAGS METHODS

change_outputs_procrad_name(tmp_output_dir, sample_name)
generate_additional_archive_file(tmp_output_dir, output_archive)

"""

def change_outputs_procrad_name(tmp_output_dir, sample_name):

    list_files = glob.glob(tmp_output_dir+'/*')

    for file in list_files:
        # change the sample name
        new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name))

        # transform .fa -> .fasta; anything else is assumed to be .fq and becomes .fastq
        if os.path.splitext(new_file_name)[1] == ".fa":
            new_file_name = os.path.splitext(new_file_name)[0]+'.fasta'
        else:
            new_file_name = os.path.splitext(new_file_name)[0]+'.fastq'

        shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name)


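# Illustrative example (hypothetical sample name): with sample_name="PopA",
# a process_radtags output named "sample_GCATG.fq" is first rewritten to
# "PopA.GCATG.fq" and then renamed on disk to "PopA.GCATG.fastq".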
def generate_additional_archive_file(tmp_output_dir, output_archive):

    list_files = glob.glob(tmp_output_dir+'/*')

    myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True)

    # for each fastq file
    for fastq_file in list_files:
        # add the file to the output archive
        myzip.write(fastq_file, os.path.basename(fastq_file))

    # close the archive so its central directory is written, then move it into place
    myzip.close()
    shutil.move("archive.zip.temp", output_archive)


"""

DENOVOMAP METHODS

check_fastq_extension_and_add(tab_files, tmp_input_dir)

"""

def check_fastq_extension_and_add(tab_files, tmp_input_dir):

    # for each file
    for key in tab_files.keys():

        if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
            # open the file
            myfastxfile=open(tab_files[key], 'r')

            # get the header
            line = myfastxfile.readline()
            line = line.strip()

            # quick fasta test
            if line.startswith( '>' ):
                tab_files[key+".fasta"]=tab_files[key]
                del tab_files[key]
            # quick fastq test
            elif line.startswith( '@' ):
                tab_files[key+".fq"]=tab_files[key]
                del tab_files[key]
            else:
                print "[WARNING] : your input file "+key+" has no known extension and was not recognized as a Fasta or Fastq file"

            myfastxfile.close()


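# Illustrative example (hypothetical dataset name): a key such as "dataset_12"
# with no recognised extension is sniffed by reading the first line of its
# file; if that line starts with "@", the entry is re-registered under the key
# "dataset_12.fq" so that downstream STACKS steps see a fastq file name.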
"""

REFMAP METHODS

"""

def check_sam_extension_and_add(tab_files, tmp_input_dir):

    # for each file
    for key in tab_files.keys():

        if not re.search("\.sam$", key):
            # add the extension
            tab_files[key+".sam"]=tab_files[key]
            del tab_files[key]



"""

PREPARE POPULATION MAP METHODS

generate_popmap_for_denovo(tab_files, infos_file, pop_map)
generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)


"""
def generate_popmap_for_denovo(tab_files, infos_file, pop_map):

    # initiate the dict : barcode -> fastq file name
    fq_name_for_barcode={}

    for key in tab_files:
        single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0]
        fq_name_for_barcode[single_barcode]=key

    # open the infos file and the output file
    my_open_info_file=open(infos_file, 'r')
    my_output_file=open(pop_map, 'w')

    # conversion tab mapping population names to integers
    pop_to_int=[]

    # write infos into the final output
    for line in my_open_info_file:

        parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip())

        if not parse_line:
            print "[WARNING] Wrong input infos file structure : "+line
        else:
            barcode=parse_line.groups(1)[0]
            population_name=parse_line.groups(1)[1]

            # first time this population is seen
            if population_name not in pop_to_int:
                pop_to_int.append(population_name)

            # strip the extension if present, because the population map must not contain it
            if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]):
                fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0]
            else:
                fqfile=fq_name_for_barcode[barcode]

            # write into the file
            my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n")

    # close files
    my_output_file.close()
    my_open_info_file.close()


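# Illustrative example (hypothetical content): given demultiplexed files named
# "PopA.GCATG.fq" and "PopB.TTACG.fq" in tab_files, and an infos_file with the
# tab-separated lines
#     GCATG<TAB>population_A
#     TTACG<TAB>population_B
# the generated pop_map would contain
#     PopA.GCATG<TAB>0
#     PopB.TTACG<TAB>1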
def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):

    # initiate the dict : barcode -> list of seq IDs
    seq_id_for_barcode={}

    # initiate the dict : barcode -> sam_name
    sam_name_for_barcode={}

    ### Parse fastq files ###
    # record, for each barcode, the sequence IDs found in its fastq file
    for fastq_file in tab_fq_files.keys():
        single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0]

        # open the fastq file
        open_fastqfile=open(tab_fq_files[fastq_file], 'r')

        # for each line, get the seq ID
        tab_seq_id=[]
        for line in open_fastqfile:
            my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
            if my_match_seqID:
                tab_seq_id.append(my_match_seqID.groups(0)[0])

        # push the tab of seq IDs for the current barcode into the dict
        seq_id_for_barcode[single_barcode]=tab_seq_id


    ### Parse sam files and get the first seq id ###
    for sam_file in tab_sam_files.keys():

        # open the sam file
        open_samfile=open(tab_sam_files[sam_file], 'r')

        # get the first seq id
        first_seq_id=''
        for line in open_samfile:
            if not re.search("^@", line):
                first_seq_id=line.split("\t")[0]
                break


        # map with the seq_id_for_barcode structure
        for barcode in seq_id_for_barcode:
            for seq in seq_id_for_barcode[barcode]:
                if seq == first_seq_id:
                    #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
                    sam_name_for_barcode[barcode]=sam_file
                    break

    # open the infos file and the output file
    my_open_info_file=open(infos_file, 'r')
    my_output_file=open(pop_map, 'w')

    # conversion tab mapping population names to integers
    pop_to_int=[]

    # write infos into the final output
    for line in my_open_info_file:
        parse_line=re.search("(^[ATCG]+)\t(.*)", line)

        if not parse_line:
            print "[WARNING] Wrong input infos file structure : "+line
        else:

            # first time this population is seen
            if parse_line.groups(1)[1] not in pop_to_int:
                pop_to_int.append(parse_line.groups(1)[1])

            # strip the extension if present, because the population map must not contain it
            if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
                samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
            else:
                samfile=sam_name_for_barcode[parse_line.groups(1)[0]]

            # write into the file
            my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")

    # close files
    my_output_file.close()
    my_open_info_file.close()


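# Illustrative example (hypothetical contents): suppose "PopA.GCATG.fq"
# contains reads with headers like "@LANE1.0001 1:N:0:1", and the sam file
# "PopA_aligned.sam" starts (after its "@..." header lines) with an alignment
# record whose QNAME is "LANE1.0001". The loops above then record
#     seq_id_for_barcode["GCATG"] == ["LANE1.0001", "LANE1.0002", ...]
#     sam_name_for_barcode["GCATG"] == "PopA_aligned.sam"
# so the population map line for barcode GCATG refers to "PopA_aligned"
# (the sam file name without its extension).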
"""

STACKS POPULATION


"""


def extract_compress_files(myfile, tmp_input_dir):

    # test if it is a zip file
    if (check_zip( myfile )):

        # open the zip archive
        myarchive = zipfile.ZipFile(myfile, 'r')

        # extract all files
        myarchive.extractall(tmp_input_dir)

    # otherwise assume a (possibly gzipped) tar archive
    else:
        # open the tar archive
        mygzfile = tarfile.open(myfile, 'r')

        # extract all files
        mygzfile.extractall(tmp_input_dir)