Mercurial > repos > ylebrascnrs > structure
comparison structure-923cc9e6aa30/adlib.py @ 0:2c0b270dae70 draft default tip
Uploaded
author | ylebrascnrs |
---|---|
date | Thu, 14 Sep 2017 08:33:05 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2c0b270dae70 |
---|---|
1 """ | |
2 | |
3 STACKS METHODS FOR GALAXY | |
4 | |
5 Created by Cyril Monjeaud & Yvan Le Bras | |
6 Cyril.Monjeaud@irisa.fr | |
7 yvan.le_bras@irisa.fr | |
8 | |
9 Last modifications : 01/22/2014 | |
10 | |
11 | |
12 """ | |
13 | |
14 import os, sys, re | |
15 import glob | |
16 import collections | |
17 import gzip, zipfile, tarfile | |
18 import subprocess | |
19 from galaxy.datatypes.checkers import * | |
20 | |
21 | |
22 """ | |
23 | |
24 STACKS COMMON METHODS | |
25 | |
26 galaxy_config_to_tabfiles(input_config) | |
27 galaxy_config_to_tabfiles_for_STACKS(input_config) | |
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir) | |
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir) | |
30 | |
31 """ | |
def galaxy_config_to_tabfiles(input_config):
    """Parse a Galaxy config file into a ``{name: path}`` dictionary.

    Every non-blank line of *input_config* is split on ``"::"``; the first
    field is the dataset name and the second field is its path.  The name is
    sanitized so that it can be used as a file name: ``" ("``, spaces, ``":"``
    and ``"/"`` become ``"."`` and ``")"`` is dropped.

    :param input_config: path of the config file to read.
    :returns: dict mapping each sanitized name to its path.
    :raises IndexError: if a non-blank line does not contain ``"::"``.
    """
    tab_files = {}

    # the context manager closes the config file even if a line is malformed
    with open(input_config, "r") as config:
        for line in config:
            line = line.strip()
            if line == '':
                continue

            extract = line.split("::")
            name = (extract[0]
                    .replace(" (", ".")
                    .replace(" ", ".")
                    .replace(")", "")
                    .replace(":", ".")
                    .replace("/", "."))

            # tab_files[name] -> path
            tab_files[name] = extract[1]

    return tab_files
42 | |
43 | |
def galaxy_config_to_tabfiles_for_STACKS(input_config):
    """Parse a Galaxy config file, shortening STACKS-generated dataset names.

    Every non-blank line of *input_config* is split on ``"::"`` into a dataset
    name and a path.  When the name starts with ``STACKS`` and ends with
    ``(<something>.<barcode>.fq)``, only the part between the parentheses is
    kept.  The resulting name is then sanitized: ``" ("``, spaces, ``":"``
    and ``"/"`` become ``"."`` and ``")"`` is dropped.

    :param input_config: path of the config file to read.
    :returns: dict mapping each (shortened, sanitized) name to its path.
    :raises IndexError: if a non-blank line does not contain ``"::"``.
    """
    # compiled once, outside the loop; raw string so the escapes reach `re`
    stacks_name = re.compile(r"^STACKS.*\((.*\.[ATCG]*\.fq)\)$")
    tab_files = {}

    # the context manager closes the config file even if a line is malformed
    with open(input_config, "r") as config:
        for line in config:
            line = line.strip()
            if line == '':
                continue

            extract = line.split("::")
            name = extract[0]

            # rename the long Galaxy name into the short file name
            parse_name = stacks_name.search(name)
            if parse_name:
                name = parse_name.group(1)

            name = (name
                    .replace(" (", ".")
                    .replace(" ", ".")
                    .replace(")", "")
                    .replace(":", ".")
                    .replace("/", "."))

            # tab_files[name] -> path
            tab_files[name] = extract[1]

    return tab_files
59 | |
60 | |
def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
    """Replace archive entries of *tab_files* by the files they contain.

    For every entry whose path is a zip archive, or a gzip-compressed tar
    archive, all members are extracted into *tmp_input_dir*, each member is
    registered in *tab_files* as ``member_name -> tmp_input_dir/member_name``
    and the archive entry itself is removed.  Other entries are left alone.

    :param tab_files: dict ``name -> path``; modified in place.
    :param tmp_input_dir: directory receiving the extracted files.
    """
    # NOTE(review): extractall() trusts the member names stored in the
    # archive; only use this on archives from a trusted source.

    # iterate over a snapshot: entries are added and removed inside the loop
    for key in list(tab_files.keys()):
        archive_path = tab_files[key]

        # test if it is a zip file
        if check_zip(archive_path):
            archive = zipfile.ZipFile(archive_path, 'r')
            try:
                members = archive.namelist()
                archive.extractall(tmp_input_dir)
            finally:
                archive.close()

        # test if it is a tar.gz file
        elif tarfile.is_tarfile(archive_path) and check_gzip(archive_path):
            archive = tarfile.open(archive_path, 'r')
            try:
                members = archive.getnames()
                archive.extractall(tmp_input_dir)
            finally:
                archive.close()

        else:
            continue

        # register every extracted file ...
        for member in members:
            tab_files[member] = tmp_input_dir + "/" + member

        # ... and remove the compressed file from the table
        del tab_files[key]
93 | |
94 | |
95 | |
def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
    """Create, in *tmp_input_dir*, one symbolic link per entry of *tab_files*.

    The link is named after the dictionary key and points to the associated
    path.  Entries for which something already exists under that name in
    *tmp_input_dir* are skipped.

    :param tab_files: dict ``name -> path``.
    :param tmp_input_dir: directory in which the links are created.
    :raises OSError: if a link cannot be created (for instance when
        *tmp_input_dir* does not exist).
    """
    for key in tab_files:
        link_path = tmp_input_dir + '/' + key

        # lexists() also detects dangling links, which exists() reports as
        # missing and on which the link creation could only fail
        if os.path.lexists(link_path):
            continue

        # os.symlink() instead of a shell "ln -s": paths containing spaces
        # or shell metacharacters are handled correctly
        os.symlink(tab_files[key], link_path)
105 | |
106 | |
107 | |
108 """ | |
109 | |
110 PROCESS RADTAGS METHODS | |
111 | |
112 generate_additional_archive_file(tmp_output_dir, output_archive) | |
113 | |
114 """ | |
115 | |
def change_outputs_procrad_name(tmp_output_dir, sample_name):
    """Rename every entry found directly inside *tmp_output_dir*.

    In each name, ``"_"`` is replaced by ``"."`` and ``"sample"`` is replaced
    by *sample_name*.

    Side effect kept for backward compatibility: when the directory contains
    at least one entry, the current working directory of the process is
    changed to *tmp_output_dir*.

    :param tmp_output_dir: directory whose entries are renamed.
    :param sample_name: text substituted for ``"sample"`` in the names.
    :raises OSError: if an entry cannot be renamed.
    """
    # resolve once so that the chdir below cannot invalidate a relative path
    output_dir = os.path.abspath(tmp_output_dir)
    list_files = glob.glob(output_dir + '/*')

    for fastq_file in list_files:
        old_name = os.path.basename(fastq_file)
        new_name = os.path.basename(
            fastq_file.replace("_", ".").replace("sample", sample_name))

        if new_name == old_name:
            continue

        # os.rename() instead of a shell "mv": names containing spaces or
        # shell metacharacters are handled correctly
        os.rename(os.path.join(output_dir, old_name),
                  os.path.join(output_dir, new_name))

    if list_files:
        os.chdir(output_dir)
123 | |
124 | |
125 | |
126 | |
def generate_additional_archive_file(tmp_output_dir, output_archive):
    """Write a zip archive containing every entry of *tmp_output_dir*.

    Entries found directly inside *tmp_output_dir* are stored in
    *output_archive* under their base name (no directory component).

    Side effect kept for backward compatibility: when the directory contains
    at least one entry, the current working directory of the process is
    changed to *tmp_output_dir*.

    :param tmp_output_dir: directory whose entries are archived.
    :param output_archive: path of the zip file to create (overwritten).
    """
    # resolve once so that the chdir below cannot invalidate a relative path
    output_dir = os.path.abspath(tmp_output_dir)
    list_files = glob.glob(output_dir + '/*')

    myzip = zipfile.ZipFile(output_archive, 'w')
    try:
        # add each file to the output archive under its base name
        for fastq_file in list_files:
            myzip.write(fastq_file, os.path.basename(fastq_file))
    finally:
        # closing is what writes the zip central directory
        myzip.close()

    if list_files:
        os.chdir(output_dir)
138 | |
139 | |
140 """ | |
141 | |
142 DENOVOMAP METHODS | |
143 | |
144 check_fastq_extension_and_add(tab_files, tmp_input_dir) | |
145 | |
146 """ | |
147 | |
def check_fastq_extension_and_add(tab_files, tmp_input_dir):
    """Add a ``.fasta`` / ``.fq`` extension to the names that lack one.

    Names already ending with ``.fq``, ``.fastq``, ``.fa`` or ``.fasta`` are
    left untouched.  For the others, the first line of the file is read: a
    line starting with ``>`` renames the entry to ``<name>.fasta``, a line
    starting with ``@`` renames it to ``<name>.fq``.  Otherwise a warning is
    printed and the entry is kept unchanged.

    :param tab_files: dict ``name -> path``; modified in place.
    :param tmp_input_dir: unused; kept so existing callers keep working.
    """
    known_extensions = (".fq", ".fastq", ".fa", ".fasta")

    # iterate over a snapshot: entries are renamed inside the loop
    for key in list(tab_files.keys()):
        if key.endswith(known_extensions):
            continue

        # only the header line is needed for the quick format test
        with open(tab_files[key], 'r') as myfastxfile:
            line = myfastxfile.readline().strip()

        # fasta rapid test
        if line.startswith('>'):
            tab_files[key + ".fasta"] = tab_files.pop(key)
        # fastq rapid test
        elif line.startswith('@'):
            tab_files[key + ".fq"] = tab_files.pop(key)
        else:
            print("[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file")
173 | |
174 | |
175 """ | |
176 | |
177 REFMAP METHODS | |
178 | |
179 """ | |
180 | |
def check_sam_extension_and_add(tab_files, tmp_input_dir):
    """Append ``.sam`` to every name of *tab_files* that does not end with it.

    :param tab_files: dict ``name -> path``; modified in place.
    :param tmp_input_dir: unused; kept so existing callers keep working.
    """
    # iterate over a snapshot: entries are renamed inside the loop
    for key in list(tab_files.keys()):
        if not key.endswith(".sam"):
            # add the extension
            tab_files[key + ".sam"] = tab_files.pop(key)
190 | |
191 | |
192 | |
193 | |
194 | |
195 | |
196 """ | |
197 | |
198 PREPARE POPULATION MAP METHODS | |
199 | |
200 generate_popmap_for_denovo(tab_files, infos_file, pop_map) | |
201 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map) | |
202 | |
203 | |
204 """ | |
def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
    """Write the population map file used for a de novo analysis.

    The barcode of each FASTQ file is taken from its name (the run of
    ``A``/``T``/``C``/``G`` letters right before ``.fq``).  *infos_file* is
    read line by line as ``<barcode> TAB <population> TAB <anything>``; for
    each line, ``<file name without extension> TAB <population index>`` is
    written to *pop_map*.  Populations are numbered from 0 in order of first
    appearance.  Blank lines of *infos_file* are ignored.

    :param tab_files: dict whose keys are the FASTQ file names.
    :param infos_file: path of the tab-separated barcode/population file.
    :param pop_map: path of the population map to write (overwritten).
    :raises ValueError: if a file name holds no ``.fq`` part, or if a
        non-blank line of *infos_file* does not have the expected layout.
    :raises KeyError: if a barcode of *infos_file* matches no FASTQ file.
    """
    # initiate the dict : barcode -> fastq name
    fq_name_for_barcode = {}
    for key in tab_files:
        barcode_match = re.search(r"([ATCG]*)\.fq", key)
        if barcode_match is None:
            raise ValueError(
                "cannot extract a barcode from file name '%s'" % key)
        fq_name_for_barcode[barcode_match.group(1)] = key

    info_pattern = re.compile("(^[ATCG]+)\t(.*)\t.*")

    # conversion table : population -> integer (order of first appearance)
    pop_to_int = {}

    with open(infos_file, 'r') as my_open_info_file:
        with open(pop_map, 'w') as my_output_file:
            for line in my_open_info_file:
                if not line.strip():
                    continue

                parse_line = info_pattern.search(line)
                if parse_line is None:
                    raise ValueError(
                        "unexpected line in '%s': %r" % (infos_file, line))
                barcode = parse_line.group(1)
                population = parse_line.group(2)

                # a population gets the next free integer on first sight
                pop_index = pop_to_int.setdefault(population, len(pop_to_int))

                # the population map file should not have the extension
                fqfile = fq_name_for_barcode[barcode]
                if re.search(r"\.fq", fqfile) or re.search(r"\.fastq", fqfile):
                    fqfile = os.path.splitext(fqfile)[0]

                my_output_file.write(fqfile + "\t" + str(pop_index) + "\n")
242 | |
243 | |
244 | |
245 | |
def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
    """Write the population map file used for a reference-based analysis.

    Each SAM file is associated with a barcode through the reads: the barcode
    of a FASTQ file is taken from its name (letters ``A``/``T``/``C``/``G``
    right before ``.fq``), the sequence identifiers of that FASTQ file are
    collected, and a SAM file belongs to every barcode whose FASTQ file
    contains the identifier of the first alignment line of the SAM file.

    *infos_file* is then read as ``<barcode> TAB <population> TAB <anything>``
    and ``<sam name without extension> TAB <population index>`` is written to
    *pop_map* for each line.  Populations are numbered from 0 in order of
    first appearance.  Blank lines of *infos_file* are ignored.

    :param tab_fq_files: dict ``fastq name -> fastq path``.
    :param tab_sam_files: dict ``sam name -> sam path``.
    :param infos_file: path of the tab-separated barcode/population file.
    :param pop_map: path of the population map to write (overwritten).
    :raises ValueError: if a FASTQ name holds no ``.fq`` part, or if a
        non-blank line of *infos_file* does not have the expected layout.
    :raises KeyError: if a barcode of *infos_file* matches no SAM file.
    """
    seq_id_pattern = re.compile(r"^@([A-Z0-9]+\.[0-9]+)\s.*")

    ### Parse fastq files ###
    # barcode -> set of sequence IDs found in the matching fastq file
    seq_id_for_barcode = {}
    for fastq_file in tab_fq_files:
        barcode_match = re.search(r"([ATCG]*)\.fq", fastq_file)
        if barcode_match is None:
            raise ValueError(
                "cannot extract a barcode from file name '%s'" % fastq_file)

        # a set gives a constant-time lookup when the sam files are mapped
        seq_ids = set()
        with open(tab_fq_files[fastq_file], 'r') as open_fastqfile:
            for line in open_fastqfile:
                my_match_seqID = seq_id_pattern.search(line)
                if my_match_seqID:
                    seq_ids.add(my_match_seqID.group(1))

        seq_id_for_barcode[barcode_match.group(1)] = seq_ids

    ### Parse sam files and get the first seq id ###
    # barcode -> sam name
    sam_name_for_barcode = {}
    for sam_file in tab_sam_files:
        first_seq_id = ''
        with open(tab_sam_files[sam_file], 'r') as open_samfile:
            for line in open_samfile:
                # header lines start with "@"; stop at the first other line
                if not line.startswith("@"):
                    first_seq_id = line.split("\t")[0]
                    break

        # map with the seq_id_for_barcode structure
        for barcode in seq_id_for_barcode:
            if first_seq_id in seq_id_for_barcode[barcode]:
                sam_name_for_barcode[barcode] = sam_file

    info_pattern = re.compile("(^[ATCG]+)\t(.*)\t.*")

    # conversion table : population -> integer (order of first appearance)
    pop_to_int = {}

    with open(infos_file, 'r') as my_open_info_file:
        with open(pop_map, 'w') as my_output_file:
            for line in my_open_info_file:
                if not line.strip():
                    continue

                parse_line = info_pattern.search(line)
                if parse_line is None:
                    raise ValueError(
                        "unexpected line in '%s': %r" % (infos_file, line))
                barcode = parse_line.group(1)
                population = parse_line.group(2)

                # a population gets the next free integer on first sight
                pop_index = pop_to_int.setdefault(population, len(pop_to_int))

                # the population map file should not have the extension
                samfile = sam_name_for_barcode[barcode]
                if re.search(r"\.sam", samfile):
                    samfile = os.path.splitext(samfile)[0]

                my_output_file.write(samfile + "\t" + str(pop_index) + "\n")
322 | |
323 | |
324 """ | |
325 | |
326 STACKS POPULATION | |
327 | |
328 | |
329 """ | |
330 | |
331 | |
def extract_compress_files(myfile, tmp_input_dir):
    """Extract every member of the archive *myfile* into *tmp_input_dir*.

    *myfile* is opened as a zip archive when ``check_zip`` accepts it and as
    a tar archive otherwise.

    :param myfile: path of the archive to extract.
    :param tmp_input_dir: directory receiving the extracted files.
    """
    # NOTE(review): extractall() trusts the member names stored in the
    # archive; only use this on archives from a trusted source.

    # test if it is a zip file, otherwise handle it as a tar(.gz) file
    if check_zip(myfile):
        myarchive = zipfile.ZipFile(myfile, 'r')
    else:
        myarchive = tarfile.open(myfile, 'r')

    # extract all files, releasing the archive handle whatever happens
    try:
        myarchive.extractall(tmp_input_dir)
    finally:
        myarchive.close()
351 | |
352 | |
353 | |
354 | |
355 |