annotate stacks.py @ 2:c9e10e0d6c10

fix discard file
author cmonjeau
date Mon, 24 Aug 2015 15:39:12 +0000
parents d6ba40f6c824
children 0e0ff9e9c761
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
1 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
2
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
3 STACKS METHODS FOR GALAXY
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
4
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
5 Created by Cyril Monjeaud & Yvan Le Bras
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
6 Cyril.Monjeaud@irisa.fr
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
7 yvan.le_bras@irisa.fr
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
8
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
9 Last modifications : 01/22/2014
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
10
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
11
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
12 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
13
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
14 import os, sys, re, shutil
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
15 import glob
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
16 import collections
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
17 import gzip, zipfile, tarfile
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
18 import subprocess
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
19 from galaxy.datatypes.checkers import *
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
20
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
21
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
22 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
23
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
24 STACKS COMMON METHODS
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
25
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
26 galaxy_config_to_tabfiles(input_config)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
27 galaxy_config_to_tabfiles_for_STACKS(input_config)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
30
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
31 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def galaxy_config_to_tabfiles(input_config):
    """Parse a Galaxy config file into an ordered ``name -> path`` mapping.

    Every non-blank line of *input_config* is split on ``::``; the first
    field is the dataset name and the second one the file path.  The name is
    made file-system friendly: `` (``, spaces, ``:`` and ``/`` become ``.``
    and ``)`` is dropped.

    Returns a ``collections.OrderedDict`` that keeps the order of the lines.
    If two lines produce the same sanitized name, the later path wins.

    Raises ``IndexError`` when a non-blank line contains no ``::``.
    """
    tab_files = collections.OrderedDict()

    # The context manager closes the handle deterministically; the previous
    # ``open(...).readlines()`` left that to the garbage collector.
    with open(input_config, "r") as config:
        for raw_line in config:
            line = raw_line.strip()
            if not line:
                continue

            extract = line.split("::")
            name = (extract[0]
                    .replace(" (", ".")
                    .replace(" ", ".")
                    .replace(")", "")
                    .replace(":", ".")
                    .replace("/", "."))

            # tab_files[name] -> path
            tab_files[name] = extract[1]

    return tab_files
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
42
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
43
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def galaxy_config_to_tabfiles_for_STACKS(input_config):
    """Parse a Galaxy config file, shortening STACKS-generated dataset names.

    Every non-blank line of *input_config* is split on ``::`` into a dataset
    name and a file path.  When the name starts with ``STACKS`` and ends with
    ``(<something>.<barcode>.fq)``, only the text inside the parentheses is
    kept as the name.  The (possibly shortened) name is then sanitized:
    `` (``, spaces, ``:`` and ``/`` become ``.`` and ``)`` is dropped.

    Returns a ``collections.OrderedDict`` that keeps the order of the lines.

    Raises ``IndexError`` when a non-blank line contains no ``::``.
    """
    # Raw string: the former plain-string pattern depended on Python leaving
    # the invalid escapes ``\(`` and ``\.`` untouched.
    stacks_name = re.compile(r"^STACKS.*\((.*\.[ATCG]*\.fq)\)$")

    tab_files = collections.OrderedDict()

    # The context manager guarantees the config file is closed.
    with open(input_config, "r") as config:
        for raw_line in config:
            line = raw_line.strip()
            if not line:
                continue

            extract = line.split("::")
            name = extract[0]

            # rename the Galaxy name into a short name
            parse_name = stacks_name.search(name)
            if parse_name:
                name = parse_name.group(1)

            name = (name
                    .replace(" (", ".")
                    .replace(" ", ".")
                    .replace(")", "")
                    .replace(":", ".")
                    .replace("/", "."))

            # tab_files[name] -> path
            tab_files[name] = extract[1]

    return tab_files
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
59
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
60
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):
    """Replace every archive listed in *tab_files* by its extracted members.

    For each entry whose path is a zip file, or a tar file that also passes
    ``check_gzip``, every member name reported by the archive is added to
    *tab_files* (``member name -> tmp_input_dir/member name``), the archive is
    extracted into *tmp_input_dir* and the archive's own entry is removed.
    Other entries are left untouched.  *tab_files* is modified in place and
    nothing is returned.

    ``check_zip`` and ``check_gzip`` are not defined in this file; presumably
    they come from the ``galaxy.datatypes.checkers`` star import.
    """
    # Iterate over a snapshot of the keys: entries are added and deleted in
    # the loop, which raises RuntimeError on Python 3 when the live key view
    # is iterated.  Members added by an archive are not re-examined.
    for key in list(tab_files.keys()):
        archive_path = tab_files[key]

        # test if it is a zip file
        if check_zip(archive_path):
            # the context manager closes the archive handle
            with zipfile.ZipFile(archive_path, 'r') as myarchive:
                # register every member name in the tab
                for member in myarchive.namelist():
                    tab_files[member] = tmp_input_dir + "/" + member

                # NOTE(review): extractall() trusts the member names stored
                # in the archive; confirm archives are trusted or validate
                # member paths against directory traversal.
                myarchive.extractall(tmp_input_dir)

            # remove the compressed file from the tab
            del tab_files[key]

        # test if it is a tar.gz file
        elif tarfile.is_tarfile(archive_path) and check_gzip(archive_path):
            with tarfile.open(archive_path, 'r') as mygzfile:
                # register every member name in the tab
                for member in mygzfile.getnames():
                    tab_files[member] = tmp_input_dir + "/" + member

                # NOTE(review): same path-traversal caveat as for zip files.
                mygzfile.extractall(tmp_input_dir)

            # remove the compressed file from the tab
            del tab_files[key]
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
93
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
94
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
95
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):
    """Create ``tmp_input_dir/<key>`` symlinks pointing at each mapped path.

    For every ``key -> path`` entry of *tab_files* a symbolic link named
    *key* is created inside *tmp_input_dir*, unless something already exists
    under that name.  A dangling symlink left under that name is replaced.
    Nothing is returned.
    """
    for key in tab_files:
        link_path = tmp_input_dir + '/' + key

        # os.path.exists() follows links and reports False for a dangling
        # one, after which os.symlink() used to fail because the name is
        # taken.  Drop the stale link so it can be recreated.
        if os.path.islink(link_path) and not os.path.exists(link_path):
            os.remove(link_path)

        # create a symlink in our temp dir
        if not os.path.lexists(link_path):
            os.symlink(tab_files[key], link_path)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
103
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
104
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
105 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
106
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
107 PROCESS RADTAGS METHODS
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
108
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
109 generate_additional_archive_file(tmp_output_dir, output_archive)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
110
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
111 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
112
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def change_outputs_procrad_name(tmp_output_dir, sample_name):
    """Rename every entry of *tmp_output_dir* after *sample_name*.

    For each path matched by ``tmp_output_dir/*``:

    * ``_`` is replaced by ``.`` and ``sample`` by *sample_name*;
    * a final ``.fa`` extension becomes ``.fasta``; any other final
      extension (not only ``.fq``) is replaced by ``.fastq``.

    The entry is then moved to its new name inside *tmp_output_dir*.
    Nothing is returned.
    """
    # ``old_path`` rather than ``file``: do not shadow the Python 2 builtin.
    for old_path in glob.glob(tmp_output_dir + '/*'):
        # change sample name
        renamed = os.path.basename(
            old_path.replace("_", ".").replace("sample", sample_name))

        # transform .fa -> .fasta, everything else -> .fastq
        stem, extension = os.path.splitext(renamed)
        if extension == ".fa":
            new_file_name = stem + '.fasta'
        else:
            new_file_name = stem + '.fastq'

        shutil.move(tmp_output_dir + '/' + os.path.basename(old_path),
                    tmp_output_dir + '/' + new_file_name)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
128
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
129
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def generate_additional_archive_file(tmp_output_dir, output_archive):
    """Zip every entry of *tmp_output_dir* into *output_archive*.

    The archive is first written to ``archive.zip.temp`` in the current
    working directory, each entry being stored under its base name, and is
    then moved to *output_archive*.  Nothing is returned.
    """
    # sorted() makes the member order of the archive deterministic
    list_files = sorted(glob.glob(tmp_output_dir + '/*'))
    temp_archive = "archive.zip.temp"

    # The archive MUST be closed before it is moved: the zip central
    # directory is only written by close().  Previously the handle was still
    # open during shutil.move(), so a move across file systems produced a
    # truncated archive.
    with zipfile.ZipFile(temp_archive, 'w', allowZip64=True) as myzip:
        # for each fastq file
        for fastq_file in list_files:
            # add file to the archive output
            myzip.write(fastq_file, os.path.basename(fastq_file))

    shutil.move(temp_archive, output_archive)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
142
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
143
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
144 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
145
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
146 DENOVOMAP METHODS
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
147
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
148 check_fastq_extension_and_add(tab_files, tmp_input_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
149
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
150 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
151
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def check_fastq_extension_and_add(tab_files, tmp_input_dir):
    """Give extension-less entries of *tab_files* a ``.fasta``/``.fq`` suffix.

    Keys already ending in ``.fq``, ``.fastq``, ``.fa`` or ``.fasta`` are left
    alone and their files are not opened.  For any other key the first line
    of the mapped file is read: a leading ``>`` renames the key to
    ``<key>.fasta``, a leading ``@`` renames it to ``<key>.fq``; otherwise a
    warning is printed and the entry is kept unchanged.  Renamed entries are
    removed and re-inserted, so they end up at the end of an ordered mapping.

    *tab_files* is modified in place; *tmp_input_dir* is not used.
    """
    known_extensions = (".fq", ".fastq", ".fa", ".fasta")

    # Snapshot of the keys: entries are renamed inside the loop, which
    # raises RuntimeError on Python 3 when the live key view is iterated.
    for key in list(tab_files.keys()):
        if key.endswith(known_extensions):
            continue

        # get the header; the context manager closes the file even when
        # reading fails
        with open(tab_files[key], 'r') as myfastxfile:
            line = myfastxfile.readline().strip()

        # fasta rapid test
        if line.startswith('>'):
            tab_files[key + ".fasta"] = tab_files[key]
            del tab_files[key]
        # fastq rapid test
        elif line.startswith('@'):
            tab_files[key + ".fq"] = tab_files[key]
            del tab_files[key]
        else:
            # single-argument call form works on Python 2 and Python 3
            print("[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file")
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
177
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
178
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
179 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
180
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
181 REFMAP METHODS
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
182
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
183 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
184
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def check_sam_extension_and_add(tab_files, tmp_input_dir):
    """Append ``.sam`` to every key of *tab_files* that lacks it.

    Keys not ending in ``.sam`` are replaced by ``<key>.sam`` with the same
    value; renamed entries are removed and re-inserted, so they end up at the
    end of an ordered mapping.  *tab_files* is modified in place and
    *tmp_input_dir* is not used.
    """
    # Snapshot of the keys: entries are renamed inside the loop, which
    # raises RuntimeError on Python 3 when the live key view is iterated.
    for key in list(tab_files.keys()):
        if not key.endswith(".sam"):
            # add the extension
            tab_files[key + ".sam"] = tab_files.pop(key)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
194
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
195
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
196
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
197
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
198
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
199
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
200 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
201
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
202 PREPARE POPULATION MAP METHODS
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
203
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
204 generate_popmap_for_denovo(tab_files, infos_file, pop_map)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
205 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
206
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
207
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
208 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def generate_popmap_for_denovo(tab_files, infos_file, pop_map):
    """Write a population map (``sample<TAB>population index``) to *pop_map*.

    *tab_files* keys are fastq file names; the barcode of a file is the run
    of ``A``/``T``/``C``/``G`` characters directly followed by ``.fq`` or
    ``.fastq`` in the key (leftmost match).  *infos_file* holds one
    ``<barcode><TAB><population>`` record per line.  For every valid record
    one line is written to *pop_map*: the file name matching the barcode,
    without its final ``.fq``/``.fastq`` extension, a tab, and the 0-based
    rank of the population in order of first appearance.  Lines of
    *infos_file* that do not match are reported with a warning and skipped.

    Raises ``AttributeError`` when a key of *tab_files* contains neither
    ``.fq`` nor ``.fastq``, and ``KeyError`` when *infos_file* names a
    barcode for which no fastq file is known.
    """
    # Raw strings avoid the invalid ``\.`` escapes of the former patterns.
    barcode_in_name = re.compile(r"([ATCG]*)(\.fq|\.fastq)")
    fastq_extension = re.compile(r"(\.fq$|\.fastq$)")
    info_record = re.compile(r"(^[ATCG]+)\t(.*)")

    # barcode -> fastq file name
    fq_name_for_barcode = {}
    for key in tab_files:
        single_barcode = barcode_in_name.search(key).group(1)
        fq_name_for_barcode[single_barcode] = key

    # conversion tab for population to integer (index = population id)
    pop_to_int = []

    # Both handles are closed even when a record raises.
    with open(infos_file, 'r') as my_open_info_file, \
            open(pop_map, 'w') as my_output_file:
        for line in my_open_info_file:
            parse_line = info_record.search(line.strip())

            if not parse_line:
                # single-argument call form works on Python 2 and Python 3
                print("[WARNING] Wrong input infos file structure : "+line)
                continue

            barcode = parse_line.group(1)
            population_name = parse_line.group(2)

            # first time this population is met
            if population_name not in pop_to_int:
                pop_to_int.append(population_name)

            if barcode not in fq_name_for_barcode:
                # same exception type as the bare dict lookup, clearer text
                raise KeyError("no fastq file found for barcode " + barcode)
            fq_name = fq_name_for_barcode[barcode]

            # the population map must not carry the fastq extension
            if fastq_extension.search(fq_name):
                fqfile = os.path.splitext(fq_name)[0]
            else:
                fqfile = fq_name

            my_output_file.write(
                fqfile + "\t" + str(pop_to_int.index(population_name)) + "\n")
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
253
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
254
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
255
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
256
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):
    """Write a population map for SAM files, pairing them with barcodes.

    * *tab_fq_files* maps fastq names to paths.  The barcode of a fastq is
      the run of ``A``/``T``/``C``/``G`` characters directly followed by
      ``.fq`` or ``.fastq`` in its name (leftmost match); its sequence ids
      are taken from the lines matching ``@<ID>.<number><whitespace>``.
    * *tab_sam_files* maps SAM names to paths.  The first field of the first
      line not starting with ``@`` is looked up in the sequence ids of every
      barcode; each barcode containing it is associated with that SAM name.
    * *infos_file* holds one ``<barcode><TAB><population>`` record per line;
      other lines are reported with a warning and skipped.

    For every valid record one line is written to *pop_map*: the SAM name
    associated with the barcode, without a trailing ``.sam``, a tab, and the
    0-based rank of the population in order of first appearance.

    Raises ``AttributeError`` when a fastq name contains neither ``.fq`` nor
    ``.fastq``, and ``KeyError`` when *infos_file* names a barcode that no
    SAM file could be associated with.
    """
    # Raw strings avoid the invalid ``\.`` / ``\s`` escapes of the former
    # plain-string patterns.
    barcode_in_name = re.compile(r"([ATCG]*)(\.fq|\.fastq)")
    seq_id_header = re.compile(r"^@([A-Z0-9]+\.[0-9]+)\s.*")
    info_record = re.compile(r"(^[ATCG]+)\t(.*)")

    # barcode -> set of sequence ids (a set makes the lookup below O(1))
    seq_id_for_barcode = {}

    # barcode -> sam name
    sam_name_for_barcode = {}

    ### Parse fastq files ###
    for fastq_file in tab_fq_files:
        single_barcode = barcode_in_name.search(fastq_file).group(1)

        seq_ids = set()
        # the context manager closes the fastq file (it used to stay open)
        with open(tab_fq_files[fastq_file], 'r') as open_fastqfile:
            for line in open_fastqfile:
                my_match_seqID = seq_id_header.search(line)
                if my_match_seqID:
                    seq_ids.add(my_match_seqID.group(1))

        seq_id_for_barcode[single_barcode] = seq_ids

    ### Parse sam files and get the first seq id ###
    for sam_file in tab_sam_files:
        first_seq_id = ''
        with open(tab_sam_files[sam_file], 'r') as open_samfile:
            for line in open_samfile:
                # skip the "@" lines; the first other line gives the id
                if not line.startswith("@"):
                    first_seq_id = line.split("\t")[0]
                    break

        # map with the seq_id_for_barcode structure
        for barcode, seq_ids in seq_id_for_barcode.items():
            if first_seq_id in seq_ids:
                sam_name_for_barcode[barcode] = sam_file

    # conversion tab for population to integer (index = population id)
    pop_to_int = []

    # Both handles are closed even when a record raises.
    with open(infos_file, 'r') as my_open_info_file, \
            open(pop_map, 'w') as my_output_file:
        for line in my_open_info_file:
            # stripped like in generate_popmap_for_denovo, so trailing
            # whitespace never becomes part of the population name
            parse_line = info_record.search(line.strip())

            if not parse_line:
                # single-argument call form works on Python 2 and Python 3
                print("[WARNING] Wrong input infos file structure : "+line)
                continue

            barcode = parse_line.group(1)
            population_name = parse_line.group(2)

            # first time this population is met
            if population_name not in pop_to_int:
                pop_to_int.append(population_name)

            if barcode not in sam_name_for_barcode:
                # same exception type as the bare dict lookup, clearer text
                raise KeyError("no sam file found for barcode " + barcode)
            sam_name = sam_name_for_barcode[barcode]

            # The population map must not carry the extension.  Only a
            # trailing ".sam" is removed: the former unanchored test also
            # fired on names merely containing ".sam" (e.g. "b.sample").
            if sam_name.endswith(".sam"):
                samfile = os.path.splitext(sam_name)[0]
            else:
                samfile = sam_name

            my_output_file.write(
                samfile + "\t" + str(pop_to_int.index(population_name)) + "\n")
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
337
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
338
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
339 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
340
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
341 STACKS POPULATION
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
342
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
343
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
344 """
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
345
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
346
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
def extract_compress_files(myfile, tmp_input_dir):
    """Extract the archive *myfile* into *tmp_input_dir*.

    When ``check_zip`` accepts *myfile* it is extracted as a zip archive;
    otherwise it is opened with ``tarfile.open`` without any further check
    and extracted.  Nothing is returned.

    ``check_zip`` is not defined in this file; presumably it comes from the
    ``galaxy.datatypes.checkers`` star import.
    """
    # NOTE(review): extractall() trusts the member names stored in the
    # archive; confirm archives are trusted or validate member paths
    # against directory traversal.

    # test if it is a zip file
    if check_zip(myfile):
        # the context manager closes the archive handle after extraction
        with zipfile.ZipFile(myfile, 'r') as myarchive:
            myarchive.extractall(tmp_input_dir)

    # otherwise treat it as a tar archive
    else:
        with tarfile.open(myfile, 'r') as mygzfile:
            mygzfile.extractall(tmp_input_dir)
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
366
d6ba40f6c824 first commit
cmonjeau
parents:
diff changeset
367