comparison structure-923cc9e6aa30/adlib.py @ 0:2c0b270dae70 draft default tip

Uploaded
author ylebrascnrs
date Thu, 14 Sep 2017 08:33:05 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2c0b270dae70
1 """
2
3 STACKS METHODS FOR GALAXY
4
5 Created by Cyril Monjeaud & Yvan Le Bras
6 Cyril.Monjeaud@irisa.fr
7 yvan.le_bras@irisa.fr
8
9 Last modifications : 01/22/2014
10
11
12 """
13
14 import os, sys, re
15 import glob
16 import collections
17 import gzip, zipfile, tarfile
18 import subprocess
19 from galaxy.datatypes.checkers import *
20
21
22 """
23
24 STACKS COMMON METHODS
25
26 galaxy_config_to_tabfiles(input_config)
27 galaxy_config_to_tabfiles_for_STACKS(input_config)
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir)
30
31 """
32 def galaxy_config_to_tabfiles(input_config):
33
34 tab_files={}
35 for line in open(input_config, "r").readlines():
36 if line.strip() != '':
37 extract=line.strip().split("::")
38 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
39
40 # tabfiles[name]-> path
41 return tab_files
42
43
44 def galaxy_config_to_tabfiles_for_STACKS(input_config):
45
46 tab_files={}
47 for line in open(input_config, "r").readlines():
48 if line.strip() != '':
49 extract=line.strip().split("::")
50 parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
51 # rename galaxy name in a short name
52 if parse_name:
53 extract[0]=parse_name.groups(1)[0]
54
55 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]
56
57 # tabfiles[name]-> path
58 return tab_files
59
60
61 def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
62
63 # for each file
64 for key in tab_files.keys():
65 #test if is zip file
66 if (check_zip( tab_files[key] )):
67
68 # extract all files names and added it in the tab
69 myarchive = zipfile.ZipFile(tab_files[key], 'r')
70 for i in myarchive.namelist():
71 tab_files[i]=tmp_input_dir+"/"+i
72
73 # extract all files
74 myarchive.extractall(tmp_input_dir)
75
76 #remove compress file from the tab
77 del tab_files[key]
78
79 #test if is tar.gz file
80 else:
81 if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
82 # extract all files names and added it in the tab
83 mygzfile = tarfile.open(tab_files[key], 'r')
84
85 for i in mygzfile.getnames():
86 tab_files[i]=tmp_input_dir+"/"+i
87
88 # extract all files
89 mygzfile.extractall(tmp_input_dir)
90
91 #remove compress file from the tab
92 del tab_files[key]
93
94
95
96 def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
97
98 for key in tab_files.keys():
99 #print "file single: "+key+" -> "+tab_files[key]
100 #create a sym_link in our temp dir
101 if not os.path.exists(tmp_input_dir+'/'+key):
102 cmd = 'ln -s '+tab_files[key]+' '+tmp_input_dir+'/'+key
103 proc = subprocess.Popen( args=cmd, shell=True )
104 returncode = proc.wait()
105
106
107
108 """
109
110 PROCESS RADTAGS METHODS
111
112 generate_additional_file(tmp_output_dir, output_archive)
113
114 """
115
116 def change_outputs_procrad_name(tmp_output_dir, sample_name):
117
118 list_files = glob.glob(tmp_output_dir+'/*')
119 for fastq_file in list_files:
120 os.chdir(tmp_output_dir)
121 new_fastq_name=os.path.basename(fastq_file.replace("_",".").replace("sample", sample_name))
122 os.system('mv '+os.path.basename(fastq_file)+' '+new_fastq_name)
123
124
125
126
127 def generate_additional_archive_file(tmp_output_dir, output_archive):
128
129 list_files = glob.glob(tmp_output_dir+'/*')
130 myzip=zipfile.ZipFile(output_archive, 'w')
131
132 # for each fastq file
133 for fastq_file in list_files:
134
135 # add file to the archive output
136 os.chdir(tmp_output_dir)
137 myzip.write(os.path.basename(fastq_file))
138
139
140 """
141
142 DENOVOMAP METHODS
143
144 check_fastq_extension_and_add(tab_files, tmp_input_dir)
145
146 """
147
148 def check_fastq_extension_and_add(tab_files, tmp_input_dir):
149
150 # for each file
151 for key in tab_files.keys():
152
153 if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
154 # open the file
155 myfastxfile=open(tab_files[key], 'r')
156
157 # get the header
158 line = myfastxfile.readline()
159 line = line.strip()
160
161 # fasta rapid test
162 if line.startswith( '>' ):
163 tab_files[key+".fasta"]=tab_files[key]
164 del tab_files[key]
165 # fastq rapid test
166 elif line.startswith( '@' ):
167 tab_files[key+".fq"]=tab_files[key]
168 del tab_files[key]
169 else:
170 print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file"
171
172 myfastxfile.close()
173
174
175 """
176
177 REFMAP METHODS
178
179 """
180
181 def check_sam_extension_and_add(tab_files, tmp_input_dir):
182
183 # for each file
184 for key in tab_files.keys():
185
186 if not re.search("\.sam$", key):
187 # add the extension
188 tab_files[key+".sam"]=tab_files[key]
189 del tab_files[key]
190
191
192
193
194
195
196 """
197
198 PREPARE POPULATION MAP METHODS
199
200 generate_popmap_for_denovo(tab_files, infos_file, pop_map)
201 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
202
203
204 """
205 def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
206
207 # initiate the dict : barcode -> tab[seq]
208 fq_name_for_barcode={}
209
210 for key in tab_files:
211 single_barcode=re.search("([ATCG]*)\.fq", key).groups(0)[0]
212 fq_name_for_barcode[single_barcode]=key
213
214 # open the infos file and output file
215 my_open_info_file=open(infos_file, 'r')
216 my_output_file=open(pop_map, 'w')
217
218 # conversion tab for population to integer
219 pop_to_int=[]
220
221 # write infos into the final output
222 for line in my_open_info_file:
223 parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line)
224
225 # if its the first meet with the population
226 if parse_line.groups(1)[1] not in pop_to_int:
227 pop_to_int.append(parse_line.groups(1)[1])
228
229 # manage ext if present, because the population map file should not have the ext
230 if re.search("\.fq", fq_name_for_barcode[parse_line.groups(1)[0]]) or re.search("\.fastq", sam_name_for_barcode[parse_line.groups(1)[0]]):
231 fqfile=os.path.splitext(fq_name_for_barcode[parse_line.groups(1)[0]])[0]
232 else:
233 fqfile=fq_name_for_barcode[parse_line.groups(1)[0]]
234
235
236 # write in the file
237 my_output_file.write(fqfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")
238
239 # close files
240 my_output_file.close()
241 my_open_info_file.close()
242
243
244
245
246 def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
247
248 # initiate the dict : barcode -> tab[seq]
249 seq_id_for_barcode={}
250
251 # initiate the dict : barcode -> sam_name
252 sam_name_for_barcode={}
253
254 ### Parse fastqfiles ###
255 # insert my barcode into a tab with sequences ID associated
256 for fastq_file in tab_fq_files.keys():
257 single_barcode=re.search("([ATCG]*)\.fq", fastq_file).groups(0)[0]
258
259 # open the fasq file
260 open_fastqfile=open(tab_fq_files[fastq_file], 'r')
261
262 # for each line, get the seq ID
263 tab_seq_id=[]
264 for line in open_fastqfile:
265 my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
266 if my_match_seqID:
267 tab_seq_id.append(my_match_seqID.groups(0)[0])
268
269 # push in a dict the tab of seqID for the current barcode
270 seq_id_for_barcode[single_barcode]=tab_seq_id
271
272
273 ### Parse samfiles and get the first seq id ###
274 for sam_file in tab_sam_files.keys():
275
276 # open the sam file
277 open_samfile=open(tab_sam_files[sam_file], 'r')
278
279 # get the first seq id
280 first_seq_id=''
281 for line in open_samfile:
282 if not re.search("^@", line):
283 first_seq_id=line.split("\t")[0]
284 break
285
286
287 # map with seq_id_for_barcode structure
288 for barcode in seq_id_for_barcode:
289 for seq in seq_id_for_barcode[barcode]:
290 if seq == first_seq_id:
291 #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
292 sam_name_for_barcode[barcode]=sam_file
293 break
294
295 # open the infos file and output file
296 my_open_info_file=open(infos_file, 'r')
297 my_output_file=open(pop_map, 'w')
298
299 # conversion tab for population to integer
300 pop_to_int=[]
301
302 # write infos into the final output
303 for line in my_open_info_file:
304 parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line)
305
306 # if its the first meet with the population
307 if parse_line.groups(1)[1] not in pop_to_int:
308 pop_to_int.append(parse_line.groups(1)[1])
309
310 # manage ext if present, because the population map file should not have the ext
311 if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
312 samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
313 else:
314 samfile=sam_name_for_barcode[parse_line.groups(1)[0]]
315
316 # write in the file
317 my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")
318
319 # close files
320 my_output_file.close()
321 my_open_info_file.close()
322
323
324 """
325
326 STACKS POPULATION
327
328
329 """
330
331
332 def extract_compress_files(myfile, tmp_input_dir):
333
334 #test if is zip file
335 if (check_zip( myfile )):
336
337 # extract all files names and added it in the tab
338 myarchive = zipfile.ZipFile(myfile, 'r')
339
340 # extract all files
341 myarchive.extractall(tmp_input_dir)
342
343
344 #test if is tar.gz file
345 else:
346 # extract all files names and added it in the tab
347 mygzfile = tarfile.open(myfile, 'r')
348
349 # extract all files
350 mygzfile.extractall(tmp_input_dir)
351
352
353
354
355