0
|
1 """
|
|
2
|
|
3 STACKS METHODS FOR GALAXY
|
|
4
|
|
5 Created by Cyril Monjeaud & Yvan Le Bras
|
|
6 Cyril.Monjeaud@irisa.fr
|
|
7 yvan.le_bras@irisa.fr
|
|
8
|
|
9 Last modifications : 01/22/2014
|
|
10
|
|
11
|
|
12 """
|
|
13
|
|
14 import os, sys, re, shutil
|
|
15 import glob
|
|
16 import collections
|
|
17 import gzip, zipfile, tarfile
|
|
18 import subprocess
|
|
19 from galaxy.datatypes.checkers import *
|
|
20
|
|
21
|
|
22 """
|
|
23
|
|
24 STACKS COMMON METHODS
|
|
25
|
|
26 galaxy_config_to_tabfiles(input_config)
|
|
27 galaxy_config_to_tabfiles_for_STACKS(input_config)
|
|
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
|
|
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir)
|
|
30
|
|
31 """
|
|
32 def galaxy_config_to_tabfiles(input_config):
|
|
33
|
|
34 tab_files=collections.OrderedDict()
|
|
35 for line in open(input_config, "r").readlines():
|
|
36 if line.strip() != '':
|
|
37 extract=line.strip().split("::")
|
3
|
38 tab_files[extract[0].replace("(", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
|
0
|
39
|
|
40 # tabfiles[name]-> path
|
|
41 return tab_files
|
|
42
|
|
43
|
|
44 def galaxy_config_to_tabfiles_for_STACKS(input_config):
|
|
45
|
|
46 tab_files=collections.OrderedDict()
|
|
47 for line in open(input_config, "r").readlines():
|
|
48 if line.strip() != '':
|
|
49 extract=line.strip().split("::")
|
3
|
50 parse_name=re.search("STACKS.*\((.*\.[ATCG]*).*\)$", extract[0])
|
0
|
51 # rename galaxy name in a short name
|
|
52 if parse_name:
|
|
53 extract[0]=parse_name.groups(1)[0]
|
|
54
|
3
|
55 tab_files[extract[0].replace("(", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
|
0
|
56
|
|
57 # tabfiles[name]-> path
|
|
58 return tab_files
|
|
59
|
|
60
|
|
61 def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
|
|
62
|
|
63 # for each file
|
|
64 for key in tab_files.keys():
|
|
65 #test if is zip file
|
|
66 if (check_zip( tab_files[key] )):
|
|
67
|
|
68 # extract all files names and added it in the tab
|
|
69 myarchive = zipfile.ZipFile(tab_files[key], 'r')
|
|
70 for i in myarchive.namelist():
|
|
71 tab_files[i]=tmp_input_dir+"/"+i
|
|
72
|
|
73 # extract all files
|
|
74 myarchive.extractall(tmp_input_dir)
|
|
75
|
|
76 #remove compress file from the tab
|
|
77 del tab_files[key]
|
|
78
|
|
79 #test if is tar.gz file
|
|
80 else:
|
|
81 if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
|
|
82 # extract all files names and added it in the tab
|
|
83 mygzfile = tarfile.open(tab_files[key], 'r')
|
|
84
|
|
85 for i in mygzfile.getnames():
|
|
86 tab_files[i]=tmp_input_dir+"/"+i
|
|
87
|
|
88 # extract all files
|
|
89 mygzfile.extractall(tmp_input_dir)
|
|
90
|
|
91 #remove compress file from the tab
|
|
92 del tab_files[key]
|
|
93
|
|
94
|
|
95
|
|
96 def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
|
|
97
|
|
98 for key in tab_files.keys():
|
|
99 #print "file single: "+key+" -> "+tab_files[key]
|
|
100 #create a sym_link in our temp dir
|
|
101 if not os.path.exists(tmp_input_dir+'/'+key):
|
|
102 os.symlink(tab_files[key], tmp_input_dir+'/'+key)
|
|
103
|
|
104
|
|
105 """
|
|
106
|
|
107 PROCESS RADTAGS METHODS
|
|
108
|
|
109 generate_additional_file(tmp_output_dir, output_archive)
|
|
110
|
|
111 """
|
|
112
|
|
113 def change_outputs_procrad_name(tmp_output_dir, sample_name):
|
|
114
|
|
115 list_files = glob.glob(tmp_output_dir+'/*')
|
|
116
|
|
117 for file in list_files:
|
|
118 # change sample name
|
|
119 new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name))
|
|
120
|
|
121 # transform .fa -> .fasta or .fq->.fastq
|
|
122 if os.path.splitext(new_file_name)[1] == ".fa":
|
|
123 new_file_name = os.path.splitext(new_file_name)[0]+'.fasta'
|
|
124 else:
|
|
125 new_file_name = os.path.splitext(new_file_name)[0]+'.fastq'
|
|
126
|
|
127 shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name)
|
|
128
|
|
129
|
|
130 def generate_additional_archive_file(tmp_output_dir, output_archive):
|
|
131
|
|
132 list_files = glob.glob(tmp_output_dir+'/*')
|
|
133
|
|
134 myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True)
|
|
135
|
|
136 # for each fastq file
|
|
137 for fastq_file in list_files:
|
|
138 # add file to the archive output
|
|
139 myzip.write(fastq_file, os.path.basename(fastq_file))
|
|
140
|
|
141 shutil.move("archive.zip.temp", output_archive)
|
|
142
|
|
143
|
|
144 """
|
|
145
|
|
146 DENOVOMAP METHODS
|
|
147
|
|
148 check_fastq_extension_and_add(tab_files, tmp_input_dir)
|
|
149
|
|
150 """
|
|
151
|
|
152 def check_fastq_extension_and_add(tab_files, tmp_input_dir):
|
|
153
|
|
154 # for each file
|
|
155 for key in tab_files.keys():
|
|
156
|
|
157 if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
|
|
158 # open the file
|
|
159 myfastxfile=open(tab_files[key], 'r')
|
|
160
|
|
161 # get the header
|
|
162 line = myfastxfile.readline()
|
|
163 line = line.strip()
|
|
164
|
|
165 # fasta rapid test
|
|
166 if line.startswith( '>' ):
|
|
167 tab_files[key+".fasta"]=tab_files[key]
|
|
168 del tab_files[key]
|
|
169 # fastq rapid test
|
|
170 elif line.startswith( '@' ):
|
|
171 tab_files[key+".fq"]=tab_files[key]
|
|
172 del tab_files[key]
|
|
173 else:
|
|
174 print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file"
|
|
175
|
|
176 myfastxfile.close()
|
|
177
|
|
178
|
|
179 """
|
|
180
|
|
181 REFMAP METHODS
|
|
182
|
|
183 """
|
|
184
|
|
185 def check_sam_extension_and_add(tab_files, tmp_input_dir):
|
|
186
|
|
187 # for each file
|
|
188 for key in tab_files.keys():
|
|
189
|
|
190 if not re.search("\.sam$", key):
|
|
191 # add the extension
|
|
192 tab_files[key+".sam"]=tab_files[key]
|
|
193 del tab_files[key]
|
|
194
|
|
195
|
|
196
|
|
197
|
|
198
|
|
199
|
|
200 """
|
|
201
|
|
202 PREPARE POPULATION MAP METHODS
|
|
203
|
|
204 generate_popmap_for_denovo(tab_files, infos_file, pop_map)
|
|
205 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
|
|
206
|
|
207
|
|
208 """
|
|
209 def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
|
|
210
|
|
211 # initiate the dict : barcode -> tab[seq]
|
|
212 fq_name_for_barcode={}
|
|
213
|
|
214 for key in tab_files:
|
|
215 single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0]
|
|
216 fq_name_for_barcode[single_barcode]=key
|
|
217
|
|
218 # open the infos file and output file
|
|
219 my_open_info_file=open(infos_file, 'r')
|
|
220 my_output_file=open(pop_map, 'w')
|
|
221
|
|
222 # conversion tab for population to integer
|
|
223 pop_to_int=[]
|
|
224
|
|
225 # write infos into the final output
|
|
226 for line in my_open_info_file:
|
|
227
|
|
228 parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip())
|
|
229
|
|
230 if not parse_line:
|
|
231 print "[WARNING] Wrong input infos file structure : "+line
|
|
232 else:
|
|
233 barcode=parse_line.groups(1)[0]
|
|
234 population_name=parse_line.groups(1)[1]
|
|
235
|
|
236 # if its the first meet with the population
|
|
237 if population_name not in pop_to_int:
|
|
238 pop_to_int.append(population_name)
|
|
239
|
|
240 # manage ext if present, because the population map file should not have the ext
|
|
241 if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]):
|
|
242 fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0]
|
|
243 else:
|
|
244 fqfile=fq_name_for_barcode[barcode]
|
|
245
|
|
246
|
|
247 # write in the file
|
|
248 my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n")
|
|
249
|
|
250 # close files
|
|
251 my_output_file.close()
|
|
252 my_open_info_file.close()
|
|
253
|
|
254
|
|
255
|
|
256
|
|
257 def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
|
|
258
|
|
259 # initiate the dict : barcode -> tab[seq]
|
|
260 seq_id_for_barcode={}
|
|
261
|
|
262 # initiate the dict : barcode -> sam_name
|
|
263 sam_name_for_barcode={}
|
|
264
|
|
265 ### Parse fastqfiles ###
|
|
266 # insert my barcode into a tab with sequences ID associated
|
|
267 for fastq_file in tab_fq_files.keys():
|
|
268 single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0]
|
|
269
|
|
270 # open the fasq file
|
|
271 open_fastqfile=open(tab_fq_files[fastq_file], 'r')
|
|
272
|
|
273 # for each line, get the seq ID
|
|
274 tab_seq_id=[]
|
|
275 for line in open_fastqfile:
|
|
276 my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
|
|
277 if my_match_seqID:
|
|
278 tab_seq_id.append(my_match_seqID.groups(0)[0])
|
|
279
|
|
280 # push in a dict the tab of seqID for the current barcode
|
|
281 seq_id_for_barcode[single_barcode]=tab_seq_id
|
|
282
|
|
283
|
|
284 ### Parse samfiles and get the first seq id ###
|
|
285 for sam_file in tab_sam_files.keys():
|
|
286
|
|
287 # open the sam file
|
|
288 open_samfile=open(tab_sam_files[sam_file], 'r')
|
|
289
|
|
290 # get the first seq id
|
|
291 first_seq_id=''
|
|
292 for line in open_samfile:
|
|
293 if not re.search("^@", line):
|
|
294 first_seq_id=line.split("\t")[0]
|
|
295 break
|
|
296
|
|
297
|
|
298 # map with seq_id_for_barcode structure
|
|
299 for barcode in seq_id_for_barcode:
|
|
300 for seq in seq_id_for_barcode[barcode]:
|
|
301 if seq == first_seq_id:
|
|
302 #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
|
|
303 sam_name_for_barcode[barcode]=sam_file
|
|
304 break
|
|
305
|
|
306 # open the infos file and output file
|
|
307 my_open_info_file=open(infos_file, 'r')
|
|
308 my_output_file=open(pop_map, 'w')
|
|
309
|
|
310 # conversion tab for population to integer
|
|
311 pop_to_int=[]
|
|
312
|
|
313 # write infos into the final output
|
|
314 for line in my_open_info_file:
|
|
315 parse_line=re.search("(^[ATCG]+)\t(.*)", line)
|
|
316
|
|
317 if not parse_line:
|
|
318 print "[WARNING] Wrong input infos file structure : "+line
|
|
319 else:
|
|
320
|
|
321 # if its the first meet with the population
|
|
322 if parse_line.groups(1)[1] not in pop_to_int:
|
|
323 pop_to_int.append(parse_line.groups(1)[1])
|
|
324
|
|
325 # manage ext if present, because the population map file should not have the ext
|
|
326 if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
|
|
327 samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
|
|
328 else:
|
|
329 samfile=sam_name_for_barcode[parse_line.groups(1)[0]]
|
|
330
|
|
331 # write in the file
|
|
332 my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")
|
|
333
|
|
334 # close files
|
|
335 my_output_file.close()
|
|
336 my_open_info_file.close()
|
|
337
|
|
338
|
|
339 """
|
|
340
|
|
341 STACKS POPULATION
|
|
342
|
|
343
|
|
344 """
|
|
345
|
|
346
|
|
347 def extract_compress_files(myfile, tmp_input_dir):
|
|
348
|
|
349 #test if is zip file
|
|
350 if (check_zip( myfile )):
|
|
351
|
|
352 # extract all files names and added it in the tab
|
|
353 myarchive = zipfile.ZipFile(myfile, 'r')
|
|
354
|
|
355 # extract all files
|
|
356 myarchive.extractall(tmp_input_dir)
|
|
357
|
|
358
|
|
359 #test if is tar.gz file
|
|
360 else:
|
|
361 # extract all files names and added it in the tab
|
|
362 mygzfile = tarfile.open(myfile, 'r')
|
|
363
|
|
364 # extract all files
|
|
365 mygzfile.extractall(tmp_input_dir)
|
|
366
|
|
367
|