Mercurial > repos > cmonjeau > stacks
diff stacks.py @ 0:d6ba40f6c824
first commit
author | cmonjeau |
---|---|
date | Mon, 24 Aug 2015 09:29:12 +0000 |
parents | |
children | 0e0ff9e9c761 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/stacks.py Mon Aug 24 09:29:12 2015 +0000 @@ -0,0 +1,367 @@ +""" + +STACKS METHODS FOR GALAXY + +Created by Cyril Monjeaud & Yvan Le Bras +Cyril.Monjeaud@irisa.fr +yvan.le_bras@irisa.fr + +Last modifications : 01/22/2014 + + +""" + +import os, sys, re, shutil +import glob +import collections +import gzip, zipfile, tarfile +import subprocess +from galaxy.datatypes.checkers import * + + +""" + +STACKS COMMON METHODS + +galaxy_config_to_tabfiles(input_config) +galaxy_config_to_tabfiles_for_STACKS(input_config) +extract_compress_files_from_tabfiles(tab_files, tmp_input_dir) +create_symlinks_from_tabfiles(tab_files, tmp_input_dir) + +""" +def galaxy_config_to_tabfiles(input_config): + + tab_files=collections.OrderedDict() + for line in open(input_config, "r").readlines(): + if line.strip() != '': + extract=line.strip().split("::") + tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] + + # tabfiles[name]-> path + return tab_files + + +def galaxy_config_to_tabfiles_for_STACKS(input_config): + + tab_files=collections.OrderedDict() + for line in open(input_config, "r").readlines(): + if line.strip() != '': + extract=line.strip().split("::") + parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0]) + # rename galaxy name in a short name + if parse_name: + extract[0]=parse_name.groups(1)[0] + + tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] + + # tabfiles[name]-> path + return tab_files + + +def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir): + + # for each file + for key in tab_files.keys(): + #test if is zip file + if (check_zip( tab_files[key] )): + + # extract all files names and added it in the tab + myarchive = zipfile.ZipFile(tab_files[key], 'r') + for i in myarchive.namelist(): + tab_files[i]=tmp_input_dir+"/"+i + + # extract all files + myarchive.extractall(tmp_input_dir) + + #remove compress file from the tab + del tab_files[key] + + #test if is tar.gz file + else: + if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ): + # extract all files names and added it in the tab + mygzfile = tarfile.open(tab_files[key], 'r') + + for i in mygzfile.getnames(): + tab_files[i]=tmp_input_dir+"/"+i + + # extract all files + mygzfile.extractall(tmp_input_dir) + + #remove compress file from the tab + del tab_files[key] + + + +def create_symlinks_from_tabfiles(tab_files, tmp_input_dir): + + for key in tab_files.keys(): + #print "file single: "+key+" -> "+tab_files[key] + #create a sym_link in our temp dir + if not os.path.exists(tmp_input_dir+'/'+key): + os.symlink(tab_files[key], tmp_input_dir+'/'+key) + + +""" + +PROCESS RADTAGS METHODS + +generate_additional_file(tmp_output_dir, output_archive) + +""" + +def change_outputs_procrad_name(tmp_output_dir, sample_name): + + list_files = glob.glob(tmp_output_dir+'/*') + + for file in list_files: + # change sample name + new_file_name=os.path.basename(file.replace("_",".").replace("sample", sample_name)) + + # transform .fa -> .fasta or .fq->.fastq + if os.path.splitext(new_file_name)[1] == ".fa": + new_file_name = os.path.splitext(new_file_name)[0]+'.fasta' + else: + new_file_name = os.path.splitext(new_file_name)[0]+'.fastq' + + shutil.move(tmp_output_dir+'/'+os.path.basename(file), tmp_output_dir+'/'+new_file_name) + + +def generate_additional_archive_file(tmp_output_dir, output_archive): + + list_files = glob.glob(tmp_output_dir+'/*') + + myzip=zipfile.ZipFile("archive.zip.temp", 'w', allowZip64=True) + + # for each fastq file + for fastq_file in list_files: + # add file to the archive output + myzip.write(fastq_file, os.path.basename(fastq_file)) + + shutil.move("archive.zip.temp", output_archive) + + +""" + +DENOVOMAP METHODS + +check_fastq_extension_and_add(tab_files, tmp_input_dir) + +""" + +def check_fastq_extension_and_add(tab_files, tmp_input_dir): + + # for each file + for key in tab_files.keys(): + + if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key): + # open the file + myfastxfile=open(tab_files[key], 'r') + + # get the header + line = myfastxfile.readline() + line = line.strip() + + # fasta rapid test + if line.startswith( '>' ): + tab_files[key+".fasta"]=tab_files[key] + del tab_files[key] + # fastq rapid test + elif line.startswith( '@' ): + tab_files[key+".fq"]=tab_files[key] + del tab_files[key] + else: + print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file" + + myfastxfile.close() + + +""" + +REFMAP METHODS + +""" + +def check_sam_extension_and_add(tab_files, tmp_input_dir): + + # for each file + for key in tab_files.keys(): + + if not re.search("\.sam$", key): + # add the extension + tab_files[key+".sam"]=tab_files[key] + del tab_files[key] + + + + + + +""" + +PREPARE POPULATION MAP METHODS + +generate_popmap_for_denovo(tab_files, infos_file, pop_map) +generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map) + + +""" +def generate_popmap_for_denovo(tab_files, infos_file, pop_map): + + # initiate the dict : barcode -> tab[seq] + fq_name_for_barcode={} + + for key in tab_files: + single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", key).groups(0)[0] + fq_name_for_barcode[single_barcode]=key + + # open the infos file and output file + my_open_info_file=open(infos_file, 'r') + my_output_file=open(pop_map, 'w') + + # conversion tab for population to integer + pop_to_int=[] + + # write infos into the final output + for line in my_open_info_file: + + parse_line=re.search("(^[ATCG]+)\t(.*)", line.strip()) + + if not parse_line: + print "[WARNING] Wrong input infos file structure : "+line + else: + barcode=parse_line.groups(1)[0] + population_name=parse_line.groups(1)[1] + + # if its the first meet with the population + if population_name not in pop_to_int: + pop_to_int.append(population_name) + + # manage ext if present, because the population map file should not have the ext + if re.search("(\.fq$|\.fastq$)", fq_name_for_barcode[barcode]): + fqfile=os.path.splitext(fq_name_for_barcode[barcode])[0] + else: + fqfile=fq_name_for_barcode[barcode] + + + # write in the file + my_output_file.write(fqfile+"\t"+str(pop_to_int.index(population_name))+"\n") + + # close files + my_output_file.close() + my_open_info_file.close() + + + + +def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map): + + # initiate the dict : barcode -> tab[seq] + seq_id_for_barcode={} + + # initiate the dict : barcode -> sam_name + sam_name_for_barcode={} + + ### Parse fastqfiles ### + # insert my barcode into a tab with sequences ID associated + for fastq_file in tab_fq_files.keys(): + single_barcode=re.search("([ATCG]*)(\.fq|\.fastq)", fastq_file).groups(0)[0] + + # open the fasq file + open_fastqfile=open(tab_fq_files[fastq_file], 'r') + + # for each line, get the seq ID + tab_seq_id=[] + for line in open_fastqfile: + my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line) + if my_match_seqID: + tab_seq_id.append(my_match_seqID.groups(0)[0]) + + # push in a dict the tab of seqID for the current barcode + seq_id_for_barcode[single_barcode]=tab_seq_id + + + ### Parse samfiles and get the first seq id ### + for sam_file in tab_sam_files.keys(): + + # open the sam file + open_samfile=open(tab_sam_files[sam_file], 'r') + + # get the first seq id + first_seq_id='' + for line in open_samfile: + if not re.search("^@", line): + first_seq_id=line.split("\t")[0] + break + + + # map with seq_id_for_barcode structure + for barcode in seq_id_for_barcode: + for seq in seq_id_for_barcode[barcode]: + if seq == first_seq_id: + #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode + sam_name_for_barcode[barcode]=sam_file + break + + # open the infos file and output file + my_open_info_file=open(infos_file, 'r') + my_output_file=open(pop_map, 'w') + + # conversion tab for population to integer + pop_to_int=[] + + # write infos into the final output + for line in my_open_info_file: + parse_line=re.search("(^[ATCG]+)\t(.*)", line) + + if not parse_line: + print "[WARNING] Wrong input infos file structure : "+line + else: + + # if its the first meet with the population + if parse_line.groups(1)[1] not in pop_to_int: + pop_to_int.append(parse_line.groups(1)[1]) + + # manage ext if present, because the population map file should not have the ext + if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]): + samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0] + else: + samfile=sam_name_for_barcode[parse_line.groups(1)[0]] + + # write in the file + my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n") + + # close files + my_output_file.close() + my_open_info_file.close() + + +""" + +STACKS POPULATION + + +""" + + +def extract_compress_files(myfile, tmp_input_dir): + + #test if is zip file + if (check_zip( myfile )): + + # extract all files names and added it in the tab + myarchive = zipfile.ZipFile(myfile, 'r') + + # extract all files + myarchive.extractall(tmp_input_dir) + + + #test if is tar.gz file + else: + # extract all files names and added it in the tab + mygzfile = tarfile.open(myfile, 'r') + + # extract all files + mygzfile.extractall(tmp_input_dir) + +