Mercurial > repos > cmonjeau > stacks
comparison stacks.py @ 0:d6ba40f6c824
first commit
| author | cmonjeau |
|---|---|
| date | Mon, 24 Aug 2015 09:29:12 +0000 |
| parents | |
| children | 0e0ff9e9c761 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d6ba40f6c824 |
|---|---|
| 1 """ | |
| 2 | |
| 3 STACKS METHODS FOR GALAXY | |
| 4 | |
| 5 Created by Cyril Monjeaud & Yvan Le Bras | |
| 6 Cyril.Monjeaud@irisa.fr | |
| 7 yvan.le_bras@irisa.fr | |
| 8 | |
| 9 Last modifications : 01/22/2014 | |
| 10 | |
| 11 | |
| 12 """ | |
| 13 | |
| 14 import os, sys, re, shutil | |
| 15 import glob | |
| 16 import collections | |
| 17 import gzip, zipfile, tarfile | |
| 18 import subprocess | |
| 19 from galaxy.datatypes.checkers import * | |
| 20 | |
| 21 | |
| 22 """ | |
| 23 | |
| 24 STACKS COMMON METHODS | |
| 25 | |
| 26 galaxy_config_to_tabfiles(input_config) | |
| 27 galaxy_config_to_tabfiles_for_STACKS(input_config) | |
| 28 extract_compress_files_from_tabfiles(tab_files, tmp_input_dir) | |
| 29 create_symlinks_from_tabfiles(tab_files, tmp_input_dir) | |
| 30 | |
| 31 """ | |
| 32 def galaxy_config_to_tabfiles(input_config): | |
| 33 | |
| 34 tab_files=collections.OrderedDict() | |
| 35 for line in open(input_config, "r").readlines(): | |
| 36 if line.strip() != '': | |
| 37 extract=line.strip().split("::") | |
| 38 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] | |
| 39 | |
| 40 # tabfiles[name]-> path | |
| 41 return tab_files | |
| 42 | |
| 43 | |
| 44 def galaxy_config_to_tabfiles_for_STACKS(input_config): | |
| 45 | |
| 46 tab_files=collections.OrderedDict() | |
| 47 for line in open(input_config, "r").readlines(): | |
| 48 if line.strip() != '': | |
| 49 extract=line.strip().split("::") | |
| 50 parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0]) | |
| 51 # rename galaxy name in a short name | |
| 52 if parse_name: | |
| 53 extract[0]=parse_name.groups(1)[0] | |
| 54 | |
| 55 tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] | |
| 56 | |
| 57 # tabfiles[name]-> path | |
| 58 return tab_files | |
| 59 | |
| 60 | |
| 61 def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir): | |
| 62 | |
| 63 # for each file | |
| 64 for key in tab_files.keys(): | |
| 65 #test if is zip file | |
| 66 if (check_zip( tab_files[key] )): | |
| 67 | |
| 68 # extract all files names and added it in the tab | |
| 69 myarchive = zipfile.ZipFile(tab_files[key], 'r') | |
| 70 for i in myarchive.namelist(): | |
| 71 tab_files[i]=tmp_input_dir+"/"+i | |
| 72 | |
| 73 # extract all files | |
| 74 myarchive.extractall(tmp_input_dir) | |
| 75 | |
| 76 #remove compress file from the tab | |
| 77 del tab_files[key] | |
| 78 | |
| 79 #test if is tar.gz file | |
| 80 else: | |
| 81 if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ): | |
| 82 # extract all files names and added it in the tab | |
| 83 mygzfile = tarfile.open(tab_files[key], 'r') | |
| 84 | |
| 85 for i in mygzfile.getnames(): | |
| 86 tab_files[i]=tmp_input_dir+"/"+i | |
| 87 | |
| 88 # extract all files | |
| 89 mygzfile.extractall(tmp_input_dir) | |
| 90 | |
| 91 #remove compress file from the tab | |
| 92 del tab_files[key] | |
| 93 | |
| 94 | |
| 95 | |
| 96 def create_symlinks_from_tabfiles(tab_files, tmp_input_dir): | |
| 97 | |
| 98 for key in tab_files.keys(): | |
| 99 #print "file single: "+key+" -> "+tab_files[key] | |
| 100 #create a sym_link in our temp dir | |
| 101 if not os.path.exists(tmp_input_dir+'/'+key): | |
| 102 os.symlink(tab_files[key], tmp_input_dir+'/'+key) | |
| 103 | |
| 104 | |
| 105 """ | |
| 106 | |
| 107 PROCESS RADTAGS METHODS | |
| 108 | |
| 109 generate_additional_file(tmp_output_dir, output_archive) | |
| 110 | |
| 111 """ | |
| 112 | |
| 113 def change_outputs_procrad_name(tmp_output_dir, sample_name): | |
| 114 | |
| 115 list_files = glob.glob(tmp_output_dir+'/*') | |
| 116 | |
| 117 for file in list_files: | |
| 118 # change sample name | |
| 119 new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name)) | |
| 120 | |
| 121 # transform .fa -> .fasta or .fq->.fastq | |
| 122 if os.path.splitext(new_file_name)[1] == ".fa": | |
| 123 new_file_name = os.path.splitext(new_file_name)[0]+'.fasta' | |
| 124 else: | |
| 125 new_file_name = os.path.splitext(new_file_name)[0]+'.fastq' | |
| 126 | |
| 127 shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name) | |
| 128 | |
| 129 | |
| 130 def generate_additional_archive_file(tmp_output_dir, output_archive): | |
| 131 | |
| 132 list_files = glob.glob(tmp_output_dir+'/*') | |
| 133 | |
| 134 myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True) | |
| 135 | |
| 136 # for each fastq file | |
| 137 for fastq_file in list_files: | |
| 138 # add file to the archive output | |
| 139 myzip.write(fastq_file, os.path.basename(fastq_file)) | |
| 140 | |
| 141 shutil.move("archive.zip.temp", output_archive) | |
| 142 | |
| 143 | |
| 144 """ | |
| 145 | |
| 146 DENOVOMAP METHODS | |
| 147 | |
| 148 check_fastq_extension_and_add(tab_files, tmp_input_dir) | |
| 149 | |
| 150 """ | |
| 151 | |
| 152 def check_fastq_extension_and_add(tab_files, tmp_input_dir): | |
| 153 | |
| 154 # for each file | |
| 155 for key in tab_files.keys(): | |
| 156 | |
| 157 if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key): | |
| 158 # open the file | |
| 159 myfastxfile=open(tab_files[key], 'r') | |
| 160 | |
| 161 # get the header | |
| 162 line = myfastxfile.readline() | |
| 163 line = line.strip() | |
| 164 | |
| 165 # fasta rapid test | |
| 166 if line.startswith( '>' ): | |
| 167 tab_files[key+".fasta"]=tab_files[key] | |
| 168 del tab_files[key] | |
| 169 # fastq rapid test | |
| 170 elif line.startswith( '@' ): | |
| 171 tab_files[key+".fq"]=tab_files[key] | |
| 172 del tab_files[key] | |
| 173 else: | |
| 174 print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file" | |
| 175 | |
| 176 myfastxfile.close() | |
| 177 | |
| 178 | |
| 179 """ | |
| 180 | |
| 181 REFMAP METHODS | |
| 182 | |
| 183 """ | |
| 184 | |
| 185 def check_sam_extension_and_add(tab_files, tmp_input_dir): | |
| 186 | |
| 187 # for each file | |
| 188 for key in tab_files.keys(): | |
| 189 | |
| 190 if not re.search("\.sam$", key): | |
| 191 # add the extension | |
| 192 tab_files[key+".sam"]=tab_files[key] | |
| 193 del tab_files[key] | |
| 194 | |
| 195 | |
| 196 | |
| 197 | |
| 198 | |
| 199 | |
| 200 """ | |
| 201 | |
| 202 PREPARE POPULATION MAP METHODS | |
| 203 | |
| 204 generate_popmap_for_denovo(tab_files, infos_file, pop_map) | |
| 205 generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map) | |
| 206 | |
| 207 | |
| 208 """ | |
| 209 def generate_popmap_for_denovo(tab_files, infos_file, pop_map): | |
| 210 | |
| 211 # initiate the dict : barcode -> tab[seq] | |
| 212 fq_name_for_barcode={} | |
| 213 | |
| 214 for key in tab_files: | |
| 215 single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0] | |
| 216 fq_name_for_barcode[single_barcode]=key | |
| 217 | |
| 218 # open the infos file and output file | |
| 219 my_open_info_file=open(infos_file, 'r') | |
| 220 my_output_file=open(pop_map, 'w') | |
| 221 | |
| 222 # conversion tab for population to integer | |
| 223 pop_to_int=[] | |
| 224 | |
| 225 # write infos into the final output | |
| 226 for line in my_open_info_file: | |
| 227 | |
| 228 parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip()) | |
| 229 | |
| 230 if not parse_line: | |
| 231 print "[WARNING] Wrong input infos file structure : "+line | |
| 232 else: | |
| 233 barcode=parse_line.groups(1)[0] | |
| 234 population_name=parse_line.groups(1)[1] | |
| 235 | |
| 236 # if its the first meet with the population | |
| 237 if population_name not in pop_to_int: | |
| 238 pop_to_int.append(population_name) | |
| 239 | |
| 240 # manage ext if present, because the population map file should not have the ext | |
| 241 if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]): | |
| 242 fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0] | |
| 243 else: | |
| 244 fqfile=fq_name_for_barcode[barcode] | |
| 245 | |
| 246 | |
| 247 # write in the file | |
| 248 my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n") | |
| 249 | |
| 250 # close files | |
| 251 my_output_file.close() | |
| 252 my_open_info_file.close() | |
| 253 | |
| 254 | |
| 255 | |
| 256 | |
| 257 def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map): | |
| 258 | |
| 259 # initiate the dict : barcode -> tab[seq] | |
| 260 seq_id_for_barcode={} | |
| 261 | |
| 262 # initiate the dict : barcode -> sam_name | |
| 263 sam_name_for_barcode={} | |
| 264 | |
| 265 ### Parse fastqfiles ### | |
| 266 # insert my barcode into a tab with sequences ID associated | |
| 267 for fastq_file in tab_fq_files.keys(): | |
| 268 single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0] | |
| 269 | |
| 270 # open the fasq file | |
| 271 open_fastqfile=open(tab_fq_files[fastq_file], 'r') | |
| 272 | |
| 273 # for each line, get the seq ID | |
| 274 tab_seq_id=[] | |
| 275 for line in open_fastqfile: | |
| 276 my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line) | |
| 277 if my_match_seqID: | |
| 278 tab_seq_id.append(my_match_seqID.groups(0)[0]) | |
| 279 | |
| 280 # push in a dict the tab of seqID for the current barcode | |
| 281 seq_id_for_barcode[single_barcode]=tab_seq_id | |
| 282 | |
| 283 | |
| 284 ### Parse samfiles and get the first seq id ### | |
| 285 for sam_file in tab_sam_files.keys(): | |
| 286 | |
| 287 # open the sam file | |
| 288 open_samfile=open(tab_sam_files[sam_file], 'r') | |
| 289 | |
| 290 # get the first seq id | |
| 291 first_seq_id='' | |
| 292 for line in open_samfile: | |
| 293 if not re.search("^@", line): | |
| 294 first_seq_id=line.split("\t")[0] | |
| 295 break | |
| 296 | |
| 297 | |
| 298 # map with seq_id_for_barcode structure | |
| 299 for barcode in seq_id_for_barcode: | |
| 300 for seq in seq_id_for_barcode[barcode]: | |
| 301 if seq == first_seq_id: | |
| 302 #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode | |
| 303 sam_name_for_barcode[barcode]=sam_file | |
| 304 break | |
| 305 | |
| 306 # open the infos file and output file | |
| 307 my_open_info_file=open(infos_file, 'r') | |
| 308 my_output_file=open(pop_map, 'w') | |
| 309 | |
| 310 # conversion tab for population to integer | |
| 311 pop_to_int=[] | |
| 312 | |
| 313 # write infos into the final output | |
| 314 for line in my_open_info_file: | |
| 315 parse_line=re.search("(^[ATCG]+)\t(.*)", line) | |
| 316 | |
| 317 if not parse_line: | |
| 318 print "[WARNING] Wrong input infos file structure : "+line | |
| 319 else: | |
| 320 | |
| 321 # if its the first meet with the population | |
| 322 if parse_line.groups(1)[1] not in pop_to_int: | |
| 323 pop_to_int.append(parse_line.groups(1)[1]) | |
| 324 | |
| 325 # manage ext if present, because the population map file should not have the ext | |
| 326 if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]): | |
| 327 samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0] | |
| 328 else: | |
| 329 samfile=sam_name_for_barcode[parse_line.groups(1)[0]] | |
| 330 | |
| 331 # write in the file | |
| 332 my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n") | |
| 333 | |
| 334 # close files | |
| 335 my_output_file.close() | |
| 336 my_open_info_file.close() | |
| 337 | |
| 338 | |
| 339 """ | |
| 340 | |
| 341 STACKS POPULATION | |
| 342 | |
| 343 | |
| 344 """ | |
| 345 | |
| 346 | |
| 347 def extract_compress_files(myfile, tmp_input_dir): | |
| 348 | |
| 349 #test if is zip file | |
| 350 if (check_zip( myfile )): | |
| 351 | |
| 352 # extract all files names and added it in the tab | |
| 353 myarchive = zipfile.ZipFile(myfile, 'r') | |
| 354 | |
| 355 # extract all files | |
| 356 myarchive.extractall(tmp_input_dir) | |
| 357 | |
| 358 | |
| 359 #test if is tar.gz file | |
| 360 else: | |
| 361 # extract all files names and added it in the tab | |
| 362 mygzfile = tarfile.open(myfile, 'r') | |
| 363 | |
| 364 # extract all files | |
| 365 mygzfile.extractall(tmp_input_dir) | |
| 366 | |
| 367 |
