comparison stacks.py @ 0:d6ba40f6c824

first commit
author cmonjeau
date Mon, 24 Aug 2015 09:29:12 +0000
parents
children 0e0ff9e9c761
comparison
equal deleted inserted replaced
-1:000000000000 0:d6ba40f6c824
1 """
2
3 STACKS METHODS FOR GALAXY
4
5 Created by Cyril Monjeaud & Yvan Le Bras
6 Cyril.Monjeaud@irisa.fr
7 yvan.le_bras@irisa.fr
8
9 Last modifications : 01/22/2014
10
11
12 """
13
14 import os, sys, re, shutil
15 import glob
16 import collections
17 import gzip, zipfile, tarfile
18 import subprocess
19 from galaxy.datatypes.checkers import *
20
21
22 """
23
24 STACKS COMMON METHODS
25
26 galaxy_config_to_tabfiles(input_config)
27 galaxy_config_to_tabfiles_for_STACKS(input_config)
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir)
30
31 """
32 def galaxy_config_to_tabfiles(input_config):
33
34 tab_files=collections.OrderedDict()
35 for line in open(input_config, "r").readlines():
36 if line.strip() != '':
37 extract=line.strip().split("::")
38 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
39
40 # tabfiles[name]-> path
41 return tab_files
42
43
44 def galaxy_config_to_tabfiles_for_STACKS(input_config):
45
46 tab_files=collections.OrderedDict()
47 for line in open(input_config, "r").readlines():
48 if line.strip() != '':
49 extract=line.strip().split("::")
50 parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
51 # rename galaxy name in a short name
52 if parse_name:
53 extract[0]=parse_name.groups(1)[0]
54
55 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
56
57 # tabfiles[name]-> path
58 return tab_files
59
60
61 def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
62
63 # for each file
64 for key in tab_files.keys():
65 #test if is zip file
66 if (check_zip( tab_files[key] )):
67
68 # extract all files names and added it in the tab
69 myarchive = zipfile.ZipFile(tab_files[key], 'r')
70 for i in myarchive.namelist():
71 tab_files[i]=tmp_input_dir+"/"+i
72
73 # extract all files
74 myarchive.extractall(tmp_input_dir)
75
76 #remove compress file from the tab
77 del tab_files[key]
78
79 #test if is tar.gz file
80 else:
81 if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
82 # extract all files names and added it in the tab
83 mygzfile = tarfile.open(tab_files[key], 'r')
84
85 for i in mygzfile.getnames():
86 tab_files[i]=tmp_input_dir+"/"+i
87
88 # extract all files
89 mygzfile.extractall(tmp_input_dir)
90
91 #remove compress file from the tab
92 del tab_files[key]
93
94
95
96 def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
97
98 for key in tab_files.keys():
99 #print "file single: "+key+" -> "+tab_files[key]
100 #create a sym_link in our temp dir
101 if not os.path.exists(tmp_input_dir+'/'+key):
102 os.symlink(tab_files[key], tmp_input_dir+'/'+key)
103
104
105 """
106
107 PROCESS RADTAGS METHODS
108
109 generate_additional_file(tmp_output_dir, output_archive)
110
111 """
112
113 def change_outputs_procrad_name(tmp_output_dir, sample_name):
114
115 list_files = glob.glob(tmp_output_dir+'/*')
116
117 for file in list_files:
118 # change sample name
119 new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name))
120
121 # transform .fa -> .fasta or .fq->.fastq
122 if os.path.splitext(new_file_name)[1] == ".fa":
123 new_file_name = os.path.splitext(new_file_name)[0]+'.fasta'
124 else:
125 new_file_name = os.path.splitext(new_file_name)[0]+'.fastq'
126
127 shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name)
128
129
130 def generate_additional_archive_file(tmp_output_dir, output_archive):
131
132 list_files = glob.glob(tmp_output_dir+'/*')
133
134 myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True)
135
136 # for each fastq file
137 for fastq_file in list_files:
138 # add file to the archive output
139 myzip.write(fastq_file, os.path.basename(fastq_file))
140
141 shutil.move("archive.zip.temp", output_archive)
142
143
144 """
145
146 DENOVOMAP METHODS
147
148 check_fastq_extension_and_add(tab_files, tmp_input_dir)
149
150 """
151
152 def check_fastq_extension_and_add(tab_files, tmp_input_dir):
153
154 # for each file
155 for key in tab_files.keys():
156
157 if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
158 # open the file
159 myfastxfile=open(tab_files[key], 'r')
160
161 # get the header
162 line = myfastxfile.readline()
163 line = line.strip()
164
165 # fasta rapid test
166 if line.startswith( '>' ):
167 tab_files[key+".fasta"]=tab_files[key]
168 del tab_files[key]
169 # fastq rapid test
170 elif line.startswith( '@' ):
171 tab_files[key+".fq"]=tab_files[key]
172 del tab_files[key]
173 else:
174 print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file"
175
176 myfastxfile.close()
177
178
179 """
180
181 REFMAP METHODS
182
183 """
184
185 def check_sam_extension_and_add(tab_files, tmp_input_dir):
186
187 # for each file
188 for key in tab_files.keys():
189
190 if not re.search("\.sam$", key):
191 # add the extension
192 tab_files[key+".sam"]=tab_files[key]
193 del tab_files[key]
194
195
196
197
198
199
200 """
201
202 PREPARE POPULATION MAP METHODS
203
204 generate_popmap_for_denovo(tab_files, infos_file, pop_map)
205 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
206
207
208 """
209 def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
210
211 # initiate the dict : barcode -> tab[seq]
212 fq_name_for_barcode={}
213
214 for key in tab_files:
215 single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0]
216 fq_name_for_barcode[single_barcode]=key
217
218 # open the infos file and output file
219 my_open_info_file=open(infos_file, 'r')
220 my_output_file=open(pop_map, 'w')
221
222 # conversion tab for population to integer
223 pop_to_int=[]
224
225 # write infos into the final output
226 for line in my_open_info_file:
227
228 parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip())
229
230 if not parse_line:
231 print "[WARNING] Wrong input infos file structure : "+line
232 else:
233 barcode=parse_line.groups(1)[0]
234 population_name=parse_line.groups(1)[1]
235
236 # if its the first meet with the population
237 if population_name not in pop_to_int:
238 pop_to_int.append(population_name)
239
240 # manage ext if present, because the population map file should not have the ext
241 if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]):
242 fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0]
243 else:
244 fqfile=fq_name_for_barcode[barcode]
245
246
247 # write in the file
248 my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n")
249
250 # close files
251 my_output_file.close()
252 my_open_info_file.close()
253
254
255
256
257 def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
258
259 # initiate the dict : barcode -> tab[seq]
260 seq_id_for_barcode={}
261
262 # initiate the dict : barcode -> sam_name
263 sam_name_for_barcode={}
264
265 ### Parse fastqfiles ###
266 # insert my barcode into a tab with sequences ID associated
267 for fastq_file in tab_fq_files.keys():
268 single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0]
269
270 # open the fasq file
271 open_fastqfile=open(tab_fq_files[fastq_file], 'r')
272
273 # for each line, get the seq ID
274 tab_seq_id=[]
275 for line in open_fastqfile:
276 my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
277 if my_match_seqID:
278 tab_seq_id.append(my_match_seqID.groups(0)[0])
279
280 # push in a dict the tab of seqID for the current barcode
281 seq_id_for_barcode[single_barcode]=tab_seq_id
282
283
284 ### Parse samfiles and get the first seq id ###
285 for sam_file in tab_sam_files.keys():
286
287 # open the sam file
288 open_samfile=open(tab_sam_files[sam_file], 'r')
289
290 # get the first seq id
291 first_seq_id=''
292 for line in open_samfile:
293 if not re.search("^@", line):
294 first_seq_id=line.split("\t")[0]
295 break
296
297
298 # map with seq_id_for_barcode structure
299 for barcode in seq_id_for_barcode:
300 for seq in seq_id_for_barcode[barcode]:
301 if seq == first_seq_id:
302 #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
303 sam_name_for_barcode[barcode]=sam_file
304 break
305
306 # open the infos file and output file
307 my_open_info_file=open(infos_file, 'r')
308 my_output_file=open(pop_map, 'w')
309
310 # conversion tab for population to integer
311 pop_to_int=[]
312
313 # write infos into the final output
314 for line in my_open_info_file:
315 parse_line=re.search("(^[ATCG]+)\t(.*)", line)
316
317 if not parse_line:
318 print "[WARNING] Wrong input infos file structure : "+line
319 else:
320
321 # if its the first meet with the population
322 if parse_line.groups(1)[1] not in pop_to_int:
323 pop_to_int.append(parse_line.groups(1)[1])
324
325 # manage ext if present, because the population map file should not have the ext
326 if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
327 samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
328 else:
329 samfile=sam_name_for_barcode[parse_line.groups(1)[0]]
330
331 # write in the file
332 my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")
333
334 # close files
335 my_output_file.close()
336 my_open_info_file.close()
337
338
339 """
340
341 STACKS POPULATION
342
343
344 """
345
346
347 def extract_compress_files(myfile, tmp_input_dir):
348
349 #test if is zip file
350 if (check_zip( myfile )):
351
352 # extract all files names and added it in the tab
353 myarchive = zipfile.ZipFile(myfile, 'r')
354
355 # extract all files
356 myarchive.extractall(tmp_input_dir)
357
358
359 #test if is tar.gz file
360 else:
361 # extract all files names and added it in the tab
362 mygzfile = tarfile.open(myfile, 'r')
363
364 # extract all files
365 mygzfile.extractall(tmp_input_dir)
366
367