0
|
1 """
|
|
2
|
|
3 STACKS METHODS FOR GALAXY
|
|
4
|
|
5 Created by Cyril Monjeaud & Yvan Le Bras
|
|
6 Cyril.Monjeaud@irisa.fr
|
|
7 yvan.le_bras@irisa.fr
|
|
8
|
|
9 Last modifications : 01/22/2014
|
|
10
|
|
11
|
|
12 """
|
|
13
|
|
14 import os, sys, re
|
|
15 import glob
|
|
16 import collections
|
|
17 import gzip, zipfile, tarfile
|
|
18 import subprocess
|
|
19 from galaxy.datatypes.checkers import *
|
|
20
|
|
21
|
|
22 """
|
|
23
|
|
24 STACKS COMMON METHODS
|
|
25
|
|
26 galaxy_config_to_tabfiles(input_config)
|
|
27 galaxy_config_to_tabfiles_for_STACKS(input_config)
|
|
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
|
|
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir)
|
|
30
|
|
31 """
|
|
32 def galaxy_config_to_tabfiles(input_config):
|
|
33
|
|
34 tab_files={}
|
|
35 for line in open(input_config, "r").readlines():
|
|
36 if line.strip() != '':
|
|
37 extract=line.strip().split("::")
|
|
38 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
|
|
39
|
|
40 # tabfiles[name]-> path
|
|
41 return tab_files
|
|
42
|
|
43
|
|
44 def galaxy_config_to_tabfiles_for_STACKS(input_config):
|
|
45
|
|
46 tab_files={}
|
|
47 for line in open(input_config, "r").readlines():
|
|
48 if line.strip() != '':
|
|
49 extract=line.strip().split("::")
|
|
50 parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
|
|
51 # rename galaxy name in a short name
|
|
52 if parse_name:
|
|
53 extract[0]=parse_name.groups(1)[0]
|
|
54
|
|
55 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
|
|
56
|
|
57 # tabfiles[name]-> path
|
|
58 return tab_files
|
|
59
|
|
60
|
|
61 def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
|
|
62
|
|
63 # for each file
|
|
64 for key in tab_files.keys():
|
|
65 #test if is zip file
|
|
66 if (check_zip( tab_files[key] )):
|
|
67
|
|
68 # extract all files names and added it in the tab
|
|
69 myarchive = zipfile.ZipFile(tab_files[key], 'r')
|
|
70 for i in myarchive.namelist():
|
|
71 tab_files[i]=tmp_input_dir+"/"+i
|
|
72
|
|
73 # extract all files
|
|
74 myarchive.extractall(tmp_input_dir)
|
|
75
|
|
76 #remove compress file from the tab
|
|
77 del tab_files[key]
|
|
78
|
|
79 #test if is tar.gz file
|
|
80 else:
|
|
81 if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
|
|
82 # extract all files names and added it in the tab
|
|
83 mygzfile = tarfile.open(tab_files[key], 'r')
|
|
84
|
|
85 for i in mygzfile.getnames():
|
|
86 tab_files[i]=tmp_input_dir+"/"+i
|
|
87
|
|
88 # extract all files
|
|
89 mygzfile.extractall(tmp_input_dir)
|
|
90
|
|
91 #remove compress file from the tab
|
|
92 del tab_files[key]
|
|
93
|
|
94
|
|
95
|
|
96 def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
|
|
97
|
|
98 for key in tab_files.keys():
|
|
99 #print "file single: "+key+" -> "+tab_files[key]
|
|
100 #create a sym_link in our temp dir
|
|
101 if not os.path.exists(tmp_input_dir+'/'+key):
|
|
102 cmd = 'ln -s '+tab_files[key]+' '+tmp_input_dir+'/'+key
|
|
103 proc = subprocess.Popen( args=cmd, shell=True )
|
|
104 returncode = proc.wait()
|
|
105
|
|
106
|
|
107
|
|
108 """
|
|
109
|
|
110 PROCESS RADTAGS METHODS
|
|
111
|
|
112 generate_additional_file(tmp_output_dir, output_archive)
|
|
113
|
|
114 """
|
|
115
|
|
116 def change_outputs_procrad_name(tmp_output_dir, sample_name):
|
|
117
|
|
118 list_files = glob.glob(tmp_output_dir+'/*')
|
|
119 for fastq_file in list_files:
|
|
120 os.chdir(tmp_output_dir)
|
|
121 new_fastq_name=os.path.basename(fastq_file.replace("_",".").replace("sample", sample_name))
|
|
122 os.system('mv '+os.path.basename(fastq_file)+' '+new_fastq_name)
|
|
123
|
|
124
|
|
125
|
|
126
|
|
127 def generate_additional_archive_file(tmp_output_dir, output_archive):
|
|
128
|
|
129 list_files = glob.glob(tmp_output_dir+'/*')
|
|
130 myzip=zipfile.ZipFile(output_archive, 'w')
|
|
131
|
|
132 # for each fastq file
|
|
133 for fastq_file in list_files:
|
|
134
|
|
135 # add file to the archive output
|
|
136 os.chdir(tmp_output_dir)
|
|
137 myzip.write(os.path.basename(fastq_file))
|
|
138
|
|
139
|
|
140 """
|
|
141
|
|
142 DENOVOMAP METHODS
|
|
143
|
|
144 check_fastq_extension_and_add(tab_files, tmp_input_dir)
|
|
145
|
|
146 """
|
|
147
|
|
148 def check_fastq_extension_and_add(tab_files, tmp_input_dir):
|
|
149
|
|
150 # for each file
|
|
151 for key in tab_files.keys():
|
|
152
|
|
153 if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
|
|
154 # open the file
|
|
155 myfastxfile=open(tab_files[key], 'r')
|
|
156
|
|
157 # get the header
|
|
158 line = myfastxfile.readline()
|
|
159 line = line.strip()
|
|
160
|
|
161 # fasta rapid test
|
|
162 if line.startswith( '>' ):
|
|
163 tab_files[key+".fasta"]=tab_files[key]
|
|
164 del tab_files[key]
|
|
165 # fastq rapid test
|
|
166 elif line.startswith( '@' ):
|
|
167 tab_files[key+".fq"]=tab_files[key]
|
|
168 del tab_files[key]
|
|
169 else:
|
|
170 print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file"
|
|
171
|
|
172 myfastxfile.close()
|
|
173
|
|
174
|
|
175 """
|
|
176
|
|
177 REFMAP METHODS
|
|
178
|
|
179 """
|
|
180
|
|
181 def check_sam_extension_and_add(tab_files, tmp_input_dir):
|
|
182
|
|
183 # for each file
|
|
184 for key in tab_files.keys():
|
|
185
|
|
186 if not re.search("\.sam$", key):
|
|
187 # add the extension
|
|
188 tab_files[key+".sam"]=tab_files[key]
|
|
189 del tab_files[key]
|
|
190
|
|
191
|
|
192
|
|
193
|
|
194
|
|
195
|
|
196 """
|
|
197
|
|
198 PREPARE POPULATION MAP METHODS
|
|
199
|
|
200 generate_popmap_for_denovo(tab_files, infos_file, pop_map)
|
|
201 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
|
|
202
|
|
203
|
|
204 """
|
|
205 def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
|
|
206
|
|
207 # initiate the dict : barcode -> tab[seq]
|
|
208 fq_name_for_barcode={}
|
|
209
|
|
210 for key in tab_files:
|
|
211 single_barcode=re.search("([ATCG]*)\.fq", key).groups(0)[0]
|
|
212 fq_name_for_barcode[single_barcode]=key
|
|
213
|
|
214 # open the infos file and output file
|
|
215 my_open_info_file=open(infos_file, 'r')
|
|
216 my_output_file=open(pop_map, 'w')
|
|
217
|
|
218 # conversion tab for population to integer
|
|
219 pop_to_int=[]
|
|
220
|
|
221 # write infos into the final output
|
|
222 for line in my_open_info_file:
|
|
223 parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line)
|
|
224
|
|
225 # if its the first meet with the population
|
|
226 if parse_line.groups(1)[1] not in pop_to_int:
|
|
227 pop_to_int.append(parse_line.groups(1)[1])
|
|
228
|
|
229 # manage ext if present, because the population map file should not have the ext
|
|
230 if re.search("\.fq", fq_name_for_barcode[parse_line.groups(1)[0]]) or re.search("\.fastq", sam_name_for_barcode[parse_line.groups(1)[0]]):
|
|
231 fqfile=os.path.splitext(fq_name_for_barcode[parse_line.groups(1)[0]])[0]
|
|
232 else:
|
|
233 fqfile=fq_name_for_barcode[parse_line.groups(1)[0]]
|
|
234
|
|
235
|
|
236 # write in the file
|
|
237 my_output_file.write(fqfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")
|
|
238
|
|
239 # close files
|
|
240 my_output_file.close()
|
|
241 my_open_info_file.close()
|
|
242
|
|
243
|
|
244
|
|
245
|
|
246 def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
|
|
247
|
|
248 # initiate the dict : barcode -> tab[seq]
|
|
249 seq_id_for_barcode={}
|
|
250
|
|
251 # initiate the dict : barcode -> sam_name
|
|
252 sam_name_for_barcode={}
|
|
253
|
|
254 ### Parse fastqfiles ###
|
|
255 # insert my barcode into a tab with sequences ID associated
|
|
256 for fastq_file in tab_fq_files.keys():
|
|
257 single_barcode=re.search("([ATCG]*)\.fq", fastq_file).groups(0)[0]
|
|
258
|
|
259 # open the fasq file
|
|
260 open_fastqfile=open(tab_fq_files[fastq_file], 'r')
|
|
261
|
|
262 # for each line, get the seq ID
|
|
263 tab_seq_id=[]
|
|
264 for line in open_fastqfile:
|
|
265 my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
|
|
266 if my_match_seqID:
|
|
267 tab_seq_id.append(my_match_seqID.groups(0)[0])
|
|
268
|
|
269 # push in a dict the tab of seqID for the current barcode
|
|
270 seq_id_for_barcode[single_barcode]=tab_seq_id
|
|
271
|
|
272
|
|
273 ### Parse samfiles and get the first seq id ###
|
|
274 for sam_file in tab_sam_files.keys():
|
|
275
|
|
276 # open the sam file
|
|
277 open_samfile=open(tab_sam_files[sam_file], 'r')
|
|
278
|
|
279 # get the first seq id
|
|
280 first_seq_id=''
|
|
281 for line in open_samfile:
|
|
282 if not re.search("^@", line):
|
|
283 first_seq_id=line.split("\t")[0]
|
|
284 break
|
|
285
|
|
286
|
|
287 # map with seq_id_for_barcode structure
|
|
288 for barcode in seq_id_for_barcode:
|
|
289 for seq in seq_id_for_barcode[barcode]:
|
|
290 if seq == first_seq_id:
|
|
291 #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
|
|
292 sam_name_for_barcode[barcode]=sam_file
|
|
293 break
|
|
294
|
|
295 # open the infos file and output file
|
|
296 my_open_info_file=open(infos_file, 'r')
|
|
297 my_output_file=open(pop_map, 'w')
|
|
298
|
|
299 # conversion tab for population to integer
|
|
300 pop_to_int=[]
|
|
301
|
|
302 # write infos into the final output
|
|
303 for line in my_open_info_file:
|
|
304 parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line)
|
|
305
|
|
306 # if its the first meet with the population
|
|
307 if parse_line.groups(1)[1] not in pop_to_int:
|
|
308 pop_to_int.append(parse_line.groups(1)[1])
|
|
309
|
|
310 # manage ext if present, because the population map file should not have the ext
|
|
311 if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
|
|
312 samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
|
|
313 else:
|
|
314 samfile=sam_name_for_barcode[parse_line.groups(1)[0]]
|
|
315
|
|
316 # write in the file
|
|
317 my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")
|
|
318
|
|
319 # close files
|
|
320 my_output_file.close()
|
|
321 my_open_info_file.close()
|
|
322
|
|
323
|
|
324 """
|
|
325
|
|
326 STACKS POPULATION
|
|
327
|
|
328
|
|
329 """
|
|
330
|
|
331
|
|
332 def extract_compress_files(myfile, tmp_input_dir):
|
|
333
|
|
334 #test if is zip file
|
|
335 if (check_zip( myfile )):
|
|
336
|
|
337 # extract all files names and added it in the tab
|
|
338 myarchive = zipfile.ZipFile(myfile, 'r')
|
|
339
|
|
340 # extract all files
|
|
341 myarchive.extractall(tmp_input_dir)
|
|
342
|
|
343
|
|
344 #test if is tar.gz file
|
|
345 else:
|
|
346 # extract all files names and added it in the tab
|
|
347 mygzfile = tarfile.open(myfile, 'r')
|
|
348
|
|
349 # extract all files
|
|
350 mygzfile.extractall(tmp_input_dir)
|
|
351
|
|
352
|
|
353
|
|
354
|
|
355
|