annotate structure-923cc9e6aa30/adlib.py @ 0:2c0b270dae70 draft default tip

Uploaded
author ylebrascnrs
date Thu, 14 Sep 2017 08:33:05 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
1 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
2
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
3 STACKS METHODS FOR GALAXY
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
4
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
5 Created by Cyril Monjeaud & Yvan Le Bras
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
6 Cyril.Monjeaud@irisa.fr
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
7 yvan.le_bras@irisa.fr
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
8
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
9 Last modifications : 01/22/2014
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
10
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
11
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
12 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
13
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
14 import os, sys, re
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
15 import glob
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
16 import collections
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
17 import gzip, zipfile, tarfile
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
18 import subprocess
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
19 from galaxy.datatypes.checkers import *
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
20
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
21
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
22 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
23
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
24 STACKS COMMON METHODS
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
25
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
26 galaxy_config_to_tabfiles(input_config)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
27 galaxy_config_to_tabfiles_for_STACKS(input_config)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
30
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
31 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def galaxy_config_to_tabfiles(input_config):
    """Parse a Galaxy config file into a ``{name: path}`` dictionary.

    Every non-blank line of *input_config* is split on ``"::"``; the first
    field is the dataset name and the second one is its path.  The name is
    turned into a file-system friendly key: ``" ("``, spaces, ``":"`` and
    ``"/"`` become ``"."`` and ``")"`` is dropped.

    Parameters:
        input_config: path of the config file to read.

    Returns:
        dict mapping the normalised name to the path.

    Raises:
        IndexError: if a non-blank line does not contain ``"::"``.
    """
    tab_files = {}

    # "with" closes the handle even on error (it used to be leaked).
    with open(input_config, "r") as config_handle:
        for raw_line in config_handle:
            line = raw_line.strip()
            if not line:
                continue

            extract = line.split("::")
            short_name = (extract[0]
                          .replace(" (", ".")
                          .replace(" ", ".")
                          .replace(")", "")
                          .replace(":", ".")
                          .replace("/", "."))
            tab_files[short_name] = extract[1]

    # tabfiles[name]-> path
    return tab_files
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
42
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
43
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def galaxy_config_to_tabfiles_for_STACKS(input_config):
    """Parse a Galaxy config file, shortening STACKS-generated names.

    Works like ``galaxy_config_to_tabfiles`` (``name::path`` per non-blank
    line), except that a name of the form
    ``STACKS ... (<something>.<BARCODE>.fq)`` is first replaced by the
    parenthesised ``<something>.<BARCODE>.fq`` part.  The resulting name is
    then normalised: ``" ("``, spaces, ``":"`` and ``"/"`` become ``"."``
    and ``")"`` is dropped.

    Parameters:
        input_config: path of the config file to read.

    Returns:
        dict mapping the (short) name to the path.

    Raises:
        IndexError: if a non-blank line does not contain ``"::"``.
    """
    # Raw string: the pattern contains backslash escapes meant for "re".
    stacks_name = re.compile(r"^STACKS.*\((.*\.[ATCG]*\.fq)\)$")
    tab_files = {}

    # "with" closes the handle even on error (it used to be leaked).
    with open(input_config, "r") as config_handle:
        for raw_line in config_handle:
            line = raw_line.strip()
            if not line:
                continue

            extract = line.split("::")
            name = extract[0]

            # rename galaxy name in a short name
            parse_name = stacks_name.search(name)
            if parse_name:
                name = parse_name.group(1)

            short_name = (name
                          .replace(" (", ".")
                          .replace(" ", ".")
                          .replace(")", "")
                          .replace(":", ".")
                          .replace("/", "."))
            tab_files[short_name] = extract[1]

    # tabfiles[name]-> path
    return tab_files
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
59
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
60
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
    """Unpack every archive listed in *tab_files* into *tmp_input_dir*.

    For each entry whose path is a zip archive, or a gzip-compressed tar
    archive, every member is extracted into *tmp_input_dir* and registered
    in *tab_files* as ``member name -> tmp_input_dir + "/" + member name``;
    the entry of the archive itself is then removed.  Other entries are
    left untouched.  *tab_files* is modified in place; nothing is returned.

    ``check_zip`` and ``check_gzip`` are not defined in this block; they
    presumably come from the ``galaxy.datatypes.checkers`` star import.

    NOTE(security): ``extractall`` trusts member names; an archive holding
    absolute or ``..`` paths can write outside *tmp_input_dir*.  Only feed
    trusted archives, or validate member names first.
    """
    # Iterate over a snapshot of the keys: entries are added and removed
    # inside the loop, which is an error on a live dict view (Python 3).
    for key in list(tab_files):
        archive_path = tab_files[key]

        # test if is zip file
        if check_zip(archive_path):
            # the context manager closes the archive (it used to be leaked)
            with zipfile.ZipFile(archive_path, 'r') as myarchive:
                # register every member name in the tab
                for member in myarchive.namelist():
                    tab_files[member] = tmp_input_dir + "/" + member

                # extract all files
                myarchive.extractall(tmp_input_dir)

            # remove compress file from the tab
            del tab_files[key]

        # test if is tar.gz file
        elif tarfile.is_tarfile(archive_path) and check_gzip(archive_path):
            with tarfile.open(archive_path, 'r') as mygzfile:
                # register every member name in the tab
                for member in mygzfile.getnames():
                    tab_files[member] = tmp_input_dir + "/" + member

                # extract all files
                mygzfile.extractall(tmp_input_dir)

            # remove compress file from the tab
            del tab_files[key]
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
93
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
94
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
95
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
    """Create one symbolic link per entry of *tab_files* in *tmp_input_dir*.

    For every ``name -> path`` entry, ``tmp_input_dir/name`` is created as a
    symlink pointing to ``path`` unless something already exists there.

    The link is made with ``os.symlink`` instead of a shell ``ln -s``
    command line, so names containing spaces or shell metacharacters are
    handled correctly and cannot inject commands.  As before, a failure to
    create one link is reported on stderr and does not stop the others.

    Parameters:
        tab_files: dict mapping link name to target path.
        tmp_input_dir: directory receiving the links.
    """
    for key in tab_files:
        link_path = tmp_input_dir + '/' + key

        # create a sym_link in our temp dir
        if not os.path.exists(link_path):
            try:
                os.symlink(tab_files[key], link_path)
            except OSError as err:
                # Best effort, like the former unchecked "ln -s" call.
                sys.stderr.write("cannot create symbolic link " + link_path
                                 + ": " + str(err) + "\n")
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
105
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
106
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
107
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
108 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
109
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
110 PROCESS RADTAGS METHODS
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
111
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
112 generate_additional_archive_file(tmp_output_dir, output_archive)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
113
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
114 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
115
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def change_outputs_procrad_name(tmp_output_dir, sample_name):
    """Rename every file of *tmp_output_dir* after the sample name.

    For each entry of the directory, ``"_"`` is replaced by ``"."`` and
    ``"sample"`` by *sample_name*; the entry is renamed accordingly inside
    *tmp_output_dir*.

    Side effect kept from the original implementation: when the directory
    is not empty, the process working directory is changed to
    *tmp_output_dir*.  The change is now done once, before the loop, so a
    relative *tmp_output_dir* no longer fails on the second file.

    Files are renamed with ``os.rename`` rather than a shell ``mv`` command
    line, so names with spaces or shell metacharacters are safe.

    Raises:
        OSError: if the directory cannot be entered or a rename fails.
    """
    list_files = glob.glob(tmp_output_dir + '/*')
    if not list_files:
        return

    os.chdir(tmp_output_dir)
    for fastq_file in list_files:
        old_fastq_name = os.path.basename(fastq_file)
        new_fastq_name = os.path.basename(
            fastq_file.replace("_", ".").replace("sample", sample_name))

        # Nothing to do when the substitutions leave the name unchanged.
        if new_fastq_name != old_fastq_name:
            os.rename(old_fastq_name, new_fastq_name)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
123
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
124
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
125
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
126
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def generate_additional_archive_file(tmp_output_dir, output_archive):
    """Zip every entry of *tmp_output_dir* into *output_archive*.

    Entries are stored under their base name (no directory prefix).  The
    archive is created (or overwritten) even when the directory is empty.

    Side effect kept from the original implementation: when the directory
    is not empty, the process working directory is changed to
    *tmp_output_dir*.  The change is now done once, before the loop, so a
    relative *tmp_output_dir* no longer fails on the second file.

    Parameters:
        tmp_output_dir: directory whose content is archived.
        output_archive: path of the zip file to write.
    """
    list_files = glob.glob(tmp_output_dir + '/*')

    # The context manager closes the archive, which is what writes the zip
    # central directory; it used to be left to garbage collection.
    with zipfile.ZipFile(output_archive, 'w') as myzip:
        if list_files:
            os.chdir(tmp_output_dir)

        # add each file to the archive output
        for fastq_file in list_files:
            myzip.write(os.path.basename(fastq_file))
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
138
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
139
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
140 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
141
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
142 DENOVOMAP METHODS
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
143
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
144 check_fastq_extension_and_add(tab_files, tmp_input_dir)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
145
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
146 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
147
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def check_fastq_extension_and_add(tab_files, tmp_input_dir):
    """Add a ``.fasta`` / ``.fq`` extension to names that lack one.

    Every name of *tab_files* not already ending in ``.fq``, ``.fastq``,
    ``.fa`` or ``.fasta`` has the first line of its file inspected: a line
    starting with ``>`` renames the entry to ``name + ".fasta"``, a line
    starting with ``@`` renames it to ``name + ".fq"``; otherwise a warning
    is printed and the entry is kept as is.  *tab_files* is modified in
    place; nothing is returned.

    Parameters:
        tab_files: dict mapping name to file path.
        tmp_input_dir: unused here; kept for interface compatibility.
    """
    known_extensions = (".fq", ".fastq", ".fa", ".fasta")

    # Iterate over a snapshot of the keys: entries are renamed inside the
    # loop, which is an error on a live dict view (Python 3).
    for key in list(tab_files):
        if key.endswith(known_extensions):
            continue

        # get the header; "with" closes the file even if reading fails
        with open(tab_files[key], 'r') as myfastxfile:
            line = myfastxfile.readline().strip()

        # fasta rapid test
        if line.startswith('>'):
            tab_files[key + ".fasta"] = tab_files.pop(key)
        # fastq rapid test
        elif line.startswith('@'):
            tab_files[key + ".fq"] = tab_files.pop(key)
        else:
            # Single-argument call form: valid on both Python 2 and 3.
            print("[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file")
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
173
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
174
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
175 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
176
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
177 REFMAP METHODS
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
178
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
179 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
180
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def check_sam_extension_and_add(tab_files, tmp_input_dir):
    """Make sure every name of *tab_files* ends with ``.sam``.

    Entries whose name does not end with ``.sam`` are re-registered under
    ``name + ".sam"`` (same path).  *tab_files* is modified in place;
    nothing is returned.

    Parameters:
        tab_files: dict mapping name to file path.
        tmp_input_dir: unused here; kept for interface compatibility.
    """
    # Iterate over a snapshot of the keys: entries are renamed inside the
    # loop, which is an error on a live dict view (Python 3).
    for key in list(tab_files):
        if not re.search(r"\.sam$", key):
            # add the extension
            tab_files[key + ".sam"] = tab_files.pop(key)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
190
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
191
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
192
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
193
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
194
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
195
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
196 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
197
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
198 PREPARE POPULATION MAP METHODS
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
199
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
200 generate_popmap_for_denovo(tab_files, infos_file, pop_map)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
201 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
202
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
203
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
204 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
    """Write a population map (``sample<TAB>population id``) for denovo_map.

    The barcode of each fastq name in *tab_files* is the ``[ATCG]*`` run
    found right before ``.fq``.  Each non-blank line of *infos_file* must
    look like ``BARCODE<TAB>population<TAB>...``; for every such line one
    line is written to *pop_map*: the fastq name of that barcode (without
    its last extension when it contains ``.fq`` or ``.fastq``), a tab, and
    an integer identifying the population (0 for the first population met,
    1 for the second, ...).

    Parameters:
        tab_files: dict whose keys are fastq names containing ``.fq``.
        infos_file: path of the tab-separated barcode/population file.
        pop_map: path of the population map to write.

    Raises:
        AttributeError: if a fastq name has no ``.fq`` part, or a non-blank
            info line does not match the expected layout.
        KeyError: if an info line uses a barcode with no fastq file.
    """
    # barcode -> fastq name
    fq_name_for_barcode = {}
    for key in tab_files:
        single_barcode = re.search(r"([ATCG]*)\.fq", key).group(1)
        fq_name_for_barcode[single_barcode] = key

    # population label -> integer, numbered in order of first appearance
    pop_to_int = {}

    # Both files are closed even if a line cannot be processed.
    with open(infos_file, 'r') as my_open_info_file, \
            open(pop_map, 'w') as my_output_file:
        for line in my_open_info_file:
            # A blank (typically trailing) line carries no sample.
            if not line.strip():
                continue

            parse_line = re.search(r"(^[ATCG]+)\t(.*)\t.*", line)
            barcode = parse_line.group(1)
            population = parse_line.group(2)
            pop_id = pop_to_int.setdefault(population, len(pop_to_int))

            # manage ext if present, because the population map file
            # should not have the ext.  (The second test used to read the
            # undefined name "sam_name_for_barcode".)
            fq_name = fq_name_for_barcode[barcode]
            if ".fq" in fq_name or ".fastq" in fq_name:
                fqfile = os.path.splitext(fq_name)[0]
            else:
                fqfile = fq_name

            my_output_file.write(fqfile + "\t" + str(pop_id) + "\n")
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
242
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
243
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
244
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
245
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
    """Write a population map (``sample<TAB>population id``) for ref_map.

    SAM files are tied to barcodes through read identifiers:

    1. For each fastq name in *tab_fq_files*, the barcode is the
       ``[ATCG]*`` run right before ``.fq``; the file is scanned for
       header lines of the form ``@ID.NUMBER <anything>`` and the
       identifiers are remembered for that barcode.
    2. For each SAM file in *tab_sam_files*, the first field of the first
       line not starting with ``@`` is looked up in those identifiers; the
       SAM name is attached to every barcode that contains it.
    3. Each non-blank line of *infos_file* must look like
       ``BARCODE<TAB>population<TAB>...``; one line is written to
       *pop_map*: the SAM name of that barcode (without its last extension
       when it contains ``.sam``), a tab, and an integer identifying the
       population (0 for the first population met, 1 for the second, ...).

    Parameters:
        tab_fq_files: dict mapping fastq name to fastq path.
        tab_sam_files: dict mapping SAM name to SAM path.
        infos_file: path of the tab-separated barcode/population file.
        pop_map: path of the population map to write.

    Raises:
        AttributeError: if a fastq name has no ``.fq`` part, or a non-blank
            info line does not match the expected layout.
        KeyError: if an info line uses a barcode no SAM file was tied to.
    """
    # barcode -> set of sequence ids (a set makes the lookup below O(1))
    seq_id_for_barcode = {}

    # barcode -> sam_name
    sam_name_for_barcode = {}

    ### Parse fastqfiles ###
    seq_id_pattern = re.compile(r"^@([A-Z0-9]+\.[0-9]+)\s.*")
    for fastq_file in tab_fq_files:
        single_barcode = re.search(r"([ATCG]*)\.fq", fastq_file).group(1)

        # collect the seq ID of every matching header line
        seq_ids = set()
        with open(tab_fq_files[fastq_file], 'r') as open_fastqfile:
            for line in open_fastqfile:
                my_match_seqID = seq_id_pattern.search(line)
                if my_match_seqID:
                    seq_ids.add(my_match_seqID.group(1))

        seq_id_for_barcode[single_barcode] = seq_ids

    ### Parse samfiles and get the first seq id ###
    for sam_file in tab_sam_files:
        first_seq_id = ''
        with open(tab_sam_files[sam_file], 'r') as open_samfile:
            for line in open_samfile:
                # lines starting with "@" are skipped
                if not line.startswith("@"):
                    first_seq_id = line.split("\t")[0]
                    break

        # map with seq_id_for_barcode structure
        for barcode, seq_ids in seq_id_for_barcode.items():
            if first_seq_id in seq_ids:
                sam_name_for_barcode[barcode] = sam_file

    # population label -> integer, numbered in order of first appearance
    pop_to_int = {}

    # Both files are closed even if a line cannot be processed.
    with open(infos_file, 'r') as my_open_info_file, \
            open(pop_map, 'w') as my_output_file:
        for line in my_open_info_file:
            # A blank (typically trailing) line carries no sample.
            if not line.strip():
                continue

            parse_line = re.search(r"(^[ATCG]+)\t(.*)\t.*", line)
            barcode = parse_line.group(1)
            population = parse_line.group(2)
            pop_id = pop_to_int.setdefault(population, len(pop_to_int))

            # manage ext if present, because the population map file
            # should not have the ext
            sam_name = sam_name_for_barcode[barcode]
            if ".sam" in sam_name:
                samfile = os.path.splitext(sam_name)[0]
            else:
                samfile = sam_name

            my_output_file.write(samfile + "\t" + str(pop_id) + "\n")
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
322
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
323
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
324 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
325
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
326 STACKS POPULATION
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
327
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
328
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
329 """
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
330
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
331
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
def extract_compress_files(myfile, tmp_input_dir):
    """Extract the archive *myfile* into *tmp_input_dir*.

    A file recognised by ``check_zip`` is opened as a zip archive; anything
    else is handed to ``tarfile.open`` (no further check is made, so a file
    that is not a tar archive makes ``tarfile`` raise).  Nothing is
    returned.

    ``check_zip`` is not defined in this block; it presumably comes from
    the ``galaxy.datatypes.checkers`` star import.

    NOTE(security): ``extractall`` trusts member names; an archive holding
    absolute or ``..`` paths can write outside *tmp_input_dir*.  Only feed
    trusted archives, or validate member names first.
    """
    # test if is zip file
    if check_zip(myfile):
        # the context manager closes the archive (it used to be leaked)
        with zipfile.ZipFile(myfile, 'r') as myarchive:
            myarchive.extractall(tmp_input_dir)
    # otherwise assume a tar archive
    else:
        with tarfile.open(myfile, 'r') as mygzfile:
            mygzfile.extractall(tmp_input_dir)
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
351
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
352
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
353
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
354
2c0b270dae70 Uploaded
ylebrascnrs
parents:
diff changeset
355