structure-923cc9e6aa30/adlib.py @ 0:2c0b270dae70 draft default tip

author: ylebrascnrs
date: Thu, 14 Sep 2017 08:33:05 -0400
description: Uploaded
| 1 """ | |
| 2 | |
| 3 STACKS METHODS FOR GALAXY | |
| 4 | |
| 5 Created by Cyril Monjeaud & Yvan Le Bras | |
| 6 Cyril.Monjeaud@irisa.fr | |
| 7 yvan.le_bras@irisa.fr | |
| 8 | |
| 9 Last modifications : 01/22/2014 | |
| 10 | |
| 11 | |
| 12 """ | |
| 13 | |
| 14 import os, sys, re | |
| 15 import glob | |
| 16 import collections | |
| 17 import gzip, zipfile, tarfile | |
| 18 import subprocess | |
| 19 from galaxy.datatypes.checkers import * | |
| 20 | |

"""

STACKS COMMON METHODS

galaxy_config_to_tabfiles(input_config)
galaxy_config_to_tabfiles_for_STACKS(input_config)
extract_compress_files_from_tabfiles(tab_files, tmp_input_dir)
create_symlinks_from_tabfiles(tab_files, tmp_input_dir)

"""
def galaxy_config_to_tabfiles(input_config):

    tab_files={}
    for line in open(input_config, "r").readlines():
        if line.strip() != '':
            extract=line.strip().split("::")
            tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]

    # tab_files[name] -> path
    return tab_files

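# Illustrative example (not part of the original tool; label and path below are
# hypothetical). Each non-empty line of the Galaxy config file is expected to be
# "<dataset label>::<path>", split on "::":
#
#   My file (R1)::/galaxy/database/files/dataset_42.dat
#
# After the label is normalised (spaces, parentheses, ':' and '/' turned into
# dots or removed), this would yield:
#
#   tab_files == {"My.file.R1": "/galaxy/database/files/dataset_42.dat"}
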

def galaxy_config_to_tabfiles_for_STACKS(input_config):

    tab_files={}
    for line in open(input_config, "r").readlines():
        if line.strip() != '':
            extract=line.strip().split("::")
            parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0])
            # rename the Galaxy label into a short name
            if parse_name:
                extract[0]=parse_name.groups(1)[0]

            tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1]

    # tab_files[name] -> path
    return tab_files

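# Illustrative example (not part of the original tool; the label below is made up).
# For a Galaxy label such as
#
#   STACKS : demultiplexed reads (indiv1.ACGT.fq)
#
# the regex above captures "indiv1.ACGT.fq", so the entry is stored under that
# short name instead of the full Galaxy label.
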

def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir):

    # for each file
    for key in list(tab_files.keys()):
        # test if it is a zip file
        if (check_zip( tab_files[key] )):

            # list the archive members and add them to the tab
            myarchive = zipfile.ZipFile(tab_files[key], 'r')
            for i in myarchive.namelist():
                tab_files[i]=tmp_input_dir+"/"+i

            # extract all files
            myarchive.extractall(tmp_input_dir)

            # remove the compressed file from the tab
            del tab_files[key]

        # test if it is a tar.gz file
        else:
            if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ):
                # list the archive members and add them to the tab
                mygzfile = tarfile.open(tab_files[key], 'r')

                for i in mygzfile.getnames():
                    tab_files[i]=tmp_input_dir+"/"+i

                # extract all files
                mygzfile.extractall(tmp_input_dir)

                # remove the compressed file from the tab
                del tab_files[key]


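# Illustrative example (not part of the original tool; names are hypothetical).
# If tab_files contained {"samples.zip": "/tmp/upload/samples.zip"} and the
# archive held a.fq and b.fq, then after this call the archive entry is removed
# and tab_files maps "a.fq" and "b.fq" to the extracted copies under tmp_input_dir.
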

def create_symlinks_from_tabfiles(tab_files, tmp_input_dir):

    for key in tab_files.keys():
        #print "file single: "+key+" -> "+tab_files[key]
        # create a symlink in our temp dir
        if not os.path.exists(tmp_input_dir+'/'+key):
            cmd = 'ln -s '+tab_files[key]+' '+tmp_input_dir+'/'+key
            proc = subprocess.Popen( args=cmd, shell=True )
            returncode = proc.wait()

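# Note: an equivalent without spawning a shell would be
#   os.symlink(tab_files[key], os.path.join(tmp_input_dir, key))
# This is only a sketch of an alternative; the shell call above is what the tool uses.
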


"""

PROCESS RADTAGS METHODS

change_outputs_procrad_name(tmp_output_dir, sample_name)
generate_additional_archive_file(tmp_output_dir, output_archive)

"""

def change_outputs_procrad_name(tmp_output_dir, sample_name):

    list_files = glob.glob(tmp_output_dir+'/*')
    for fastq_file in list_files:
        # rename each process_radtags output: underscores become dots and the
        # generic "sample" prefix is replaced by the user-supplied sample name
        os.chdir(tmp_output_dir)
        new_fastq_name=os.path.basename(fastq_file.replace("_",".").replace("sample", sample_name))
        os.system('mv '+os.path.basename(fastq_file)+' '+new_fastq_name)

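# Illustrative example (not part of the original tool; the file name is made up).
# With sample_name = "indiv1", an output called "sample_ACGT.fq" would be
# renamed "indiv1.ACGT.fq".
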


def generate_additional_archive_file(tmp_output_dir, output_archive):

    list_files = glob.glob(tmp_output_dir+'/*')
    myzip=zipfile.ZipFile(output_archive, 'w')

    # for each fastq file
    for fastq_file in list_files:

        # add the file to the output archive
        os.chdir(tmp_output_dir)
        myzip.write(os.path.basename(fastq_file))

    # close the archive so the central directory is written
    myzip.close()

| 140 """ | |
| 141 | |
| 142 DENOVOMAP METHODS | |
| 143 | |
| 144 check_fastq_extension_and_add(tab_files, tmp_input_dir) | |
| 145 | |
| 146 """ | |
| 147 | |
def check_fastq_extension_and_add(tab_files, tmp_input_dir):

    # for each file
    for key in list(tab_files.keys()):

        if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key):
            # open the file
            myfastxfile=open(tab_files[key], 'r')

            # get the header
            line = myfastxfile.readline()
            line = line.strip()

            # fasta quick test
            if line.startswith( '>' ):
                tab_files[key+".fasta"]=tab_files[key]
                del tab_files[key]
            # fastq quick test
            elif line.startswith( '@' ):
                tab_files[key+".fq"]=tab_files[key]
                del tab_files[key]
            else:
                print("[WARNING] : your input file "+key+" has no extension and is not recognized as a Fasta or Fastq file")

            myfastxfile.close()

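# Illustrative example (not part of the original tool; the key is hypothetical).
# An extension-less entry "indiv1" whose file starts with "@SRR000001.1 ..." would
# be re-keyed as "indiv1.fq"; a file starting with ">contig_1" would become
# "indiv1.fasta".
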

"""

REFMAP METHODS

check_sam_extension_and_add(tab_files, tmp_input_dir)

"""

def check_sam_extension_and_add(tab_files, tmp_input_dir):

    # for each file
    for key in list(tab_files.keys()):

        if not re.search("\.sam$", key):
            # add the extension
            tab_files[key+".sam"]=tab_files[key]
            del tab_files[key]


| 196 """ | |
| 197 | |
| 198 PREPARE POPULATION MAP METHODS | |
| 199 | |
| 200 generate_popmap_for_denovo(tab_files, infos_file, pop_map) | |
| 201 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map) | |
| 202 | |
| 203 | |
| 204 """ | |
def generate_popmap_for_denovo(tab_files, infos_file, pop_map):

    # initiate the dict : barcode -> fq file name
    fq_name_for_barcode={}

    for key in tab_files:
        single_barcode=re.search("([ATCG]*)\.fq", key).groups(0)[0]
        fq_name_for_barcode[single_barcode]=key

    # open the infos file and the output file
    my_open_info_file=open(infos_file, 'r')
    my_output_file=open(pop_map, 'w')

    # conversion tab from population name to integer
    pop_to_int=[]

    # write infos into the final output
    for line in my_open_info_file:
        parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line)

        # if this is the first time we meet this population
        if parse_line.groups(1)[1] not in pop_to_int:
            pop_to_int.append(parse_line.groups(1)[1])

        # strip the extension if present, because the population map file should not contain it
        if re.search("\.fq", fq_name_for_barcode[parse_line.groups(1)[0]]) or re.search("\.fastq", fq_name_for_barcode[parse_line.groups(1)[0]]):
            fqfile=os.path.splitext(fq_name_for_barcode[parse_line.groups(1)[0]])[0]
        else:
            fqfile=fq_name_for_barcode[parse_line.groups(1)[0]]

        # write in the file
        my_output_file.write(fqfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")

    # close files
    my_output_file.close()
    my_open_info_file.close()

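# Illustrative example (not part of the original tool; barcodes, population names
# and file names are made up). With an infos_file containing tab-separated lines
# such as
#
#   ACGT<TAB>pop_A<TAB>indiv1
#   TGCA<TAB>pop_B<TAB>indiv2
#
# and demultiplexed files keyed "indiv1.ACGT.fq" and "indiv2.TGCA.fq", the
# resulting pop_map would contain
#
#   indiv1.ACGT<TAB>0
#   indiv2.TGCA<TAB>1
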


def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map):

    # initiate the dict : barcode -> tab[seq]
    seq_id_for_barcode={}

    # initiate the dict : barcode -> sam_name
    sam_name_for_barcode={}

    ### Parse fastq files ###
    # for each barcode, collect the associated sequence IDs
    for fastq_file in tab_fq_files.keys():
        single_barcode=re.search("([ATCG]*)\.fq", fastq_file).groups(0)[0]

        # open the fastq file
        open_fastqfile=open(tab_fq_files[fastq_file], 'r')

        # for each line, get the seq ID
        tab_seq_id=[]
        for line in open_fastqfile:
            my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line)
            if my_match_seqID:
                tab_seq_id.append(my_match_seqID.groups(0)[0])

        # store the tab of seq IDs for the current barcode
        seq_id_for_barcode[single_barcode]=tab_seq_id


    ### Parse sam files and get the first seq id ###
    for sam_file in tab_sam_files.keys():

        # open the sam file
        open_samfile=open(tab_sam_files[sam_file], 'r')

        # get the first seq id
        first_seq_id=''
        for line in open_samfile:
            if not re.search("^@", line):
                first_seq_id=line.split("\t")[0]
                break

        # map with the seq_id_for_barcode structure
        for barcode in seq_id_for_barcode:
            for seq in seq_id_for_barcode[barcode]:
                if seq == first_seq_id:
                    #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode
                    sam_name_for_barcode[barcode]=sam_file
                    break

    # open the infos file and the output file
    my_open_info_file=open(infos_file, 'r')
    my_output_file=open(pop_map, 'w')

    # conversion tab from population name to integer
    pop_to_int=[]

    # write infos into the final output
    for line in my_open_info_file:
        parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line)

        # if this is the first time we meet this population
        if parse_line.groups(1)[1] not in pop_to_int:
            pop_to_int.append(parse_line.groups(1)[1])

        # strip the extension if present, because the population map file should not contain it
        if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]):
            samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0]
        else:
            samfile=sam_name_for_barcode[parse_line.groups(1)[0]]

        # write in the file
        my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n")

    # close files
    my_output_file.close()
    my_open_info_file.close()

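# Illustrative note (not part of the original tool). Each SAM file is attributed
# to a barcode by taking the QNAME (first tab-separated field) of its first
# alignment record and looking it up in the read IDs collected from each
# demultiplexed FASTQ; the matching barcode then gets that SAM file in the
# population map.
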

"""

STACKS POPULATION

extract_compress_files(myfile, tmp_input_dir)

"""


def extract_compress_files(myfile, tmp_input_dir):

    # test if it is a zip file
    if (check_zip( myfile )):

        # open the archive
        myarchive = zipfile.ZipFile(myfile, 'r')

        # extract all files
        myarchive.extractall(tmp_input_dir)

    # otherwise assume it is a tar(.gz) file
    else:
        # open the archive
        mygzfile = tarfile.open(myfile, 'r')

        # extract all files
        mygzfile.extractall(tmp_input_dir)

