Mercurial > repos > ylebrascnrs > structure
view structure-923cc9e6aa30/adlib.py @ 0:2c0b270dae70 draft default tip
Uploaded
author | ylebrascnrs |
---|---|
date | Thu, 14 Sep 2017 08:33:05 -0400 |
parents | |
children |
line wrap: on
line source
""" STACKS METHODS FOR GALAXY Created by Cyril Monjeaud & Yvan Le Bras Cyril.Monjeaud@irisa.fr yvan.le_bras@irisa.fr Last modifications : 01/22/2014 """ import os, sys, re import glob import collections import gzip, zipfile, tarfile import subprocess from galaxy.datatypes.checkers import * """ STACKS COMMON METHODS galaxy_config_to_tabfiles(input_config) galaxy_config_to_tabfiles_for_STACKS(input_config) extract_compress_files_from_tabfiles(tab_files, tmp_input_dir) create_symlinks_from_tabfiles(tab_files, tmp_input_dir) """ def galaxy_config_to_tabfiles(input_config): tab_files={} for line in open(input_config, "r").readlines(): if line.strip() != '': extract=line.strip().split("::") tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] # tabfiles[name]-> path return tab_files def galaxy_config_to_tabfiles_for_STACKS(input_config): tab_files={} for line in open(input_config, "r").readlines(): if line.strip() != '': extract=line.strip().split("::") parse_name=re.search("^STACKS.*\((.*\.[ATCG]*\.fq)\)$", extract[0]) # rename galaxy name in a short name if parse_name: extract[0]=parse_name.groups(1)[0] tab_files[extract[0].replace(" (", ".").replace(" ", ".").replace(")", "").replace(":", ".").replace("/", ".")]=extract[1] # tabfiles[name]-> path return tab_files def extract_compress_files_from_tabfiles(tab_files, tmp_input_dir): # for each file for key in tab_files.keys(): #test if is zip file if (check_zip( tab_files[key] )): # extract all files names and added it in the tab myarchive = zipfile.ZipFile(tab_files[key], 'r') for i in myarchive.namelist(): tab_files[i]=tmp_input_dir+"/"+i # extract all files myarchive.extractall(tmp_input_dir) #remove compress file from the tab del tab_files[key] #test if is tar.gz file else: if tarfile.is_tarfile( tab_files[key] ) and check_gzip( tab_files[key] ): # extract all files names and added it in the tab mygzfile = tarfile.open(tab_files[key], 'r') for i in mygzfile.getnames(): tab_files[i]=tmp_input_dir+"/"+i # extract all files mygzfile.extractall(tmp_input_dir) #remove compress file from the tab del tab_files[key] def create_symlinks_from_tabfiles(tab_files, tmp_input_dir): for key in tab_files.keys(): #print "file single: "+key+" -> "+tab_files[key] #create a sym_link in our temp dir if not os.path.exists(tmp_input_dir+'/'+key): cmd = 'ln -s '+tab_files[key]+' '+tmp_input_dir+'/'+key proc = subprocess.Popen( args=cmd, shell=True ) returncode = proc.wait() """ PROCESS RADTAGS METHODS generate_additional_file(tmp_output_dir, output_archive) """ def change_outputs_procrad_name(tmp_output_dir, sample_name): list_files = glob.glob(tmp_output_dir+'/*') for fastq_file in list_files: os.chdir(tmp_output_dir) new_fastq_name=os.path.basename(fastq_file.replace("_",".").replace("sample", sample_name)) os.system('mv '+os.path.basename(fastq_file)+' '+new_fastq_name) def generate_additional_archive_file(tmp_output_dir, output_archive): list_files = glob.glob(tmp_output_dir+'/*') myzip=zipfile.ZipFile(output_archive, 'w') # for each fastq file for fastq_file in list_files: # add file to the archive output os.chdir(tmp_output_dir) myzip.write(os.path.basename(fastq_file)) """ DENOVOMAP METHODS check_fastq_extension_and_add(tab_files, tmp_input_dir) """ def check_fastq_extension_and_add(tab_files, tmp_input_dir): # for each file for key in tab_files.keys(): if not re.search("\.fq$", key) and not re.search("\.fastq$", key) and not re.search("\.fa$", key) and not re.search("\.fasta$", key): # open the file myfastxfile=open(tab_files[key], 'r') # get the header line = myfastxfile.readline() line = line.strip() # fasta rapid test if line.startswith( '>' ): tab_files[key+".fasta"]=tab_files[key] del tab_files[key] # fastq rapid test elif line.startswith( '@' ): tab_files[key+".fq"]=tab_files[key] del tab_files[key] else: print "[WARNING] : your input file "+key+" was not extension and is not recognize as a Fasta or Fastq file" myfastxfile.close() """ REFMAP METHODS """ def check_sam_extension_and_add(tab_files, tmp_input_dir): # for each file for key in tab_files.keys(): if not re.search("\.sam$", key): # add the extension tab_files[key+".sam"]=tab_files[key] del tab_files[key] """ PREPARE POPULATION MAP METHODS generate_popmap_for_denovo(tab_files, infos_file, pop_map) generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map) """ def generate_popmap_for_denovo(tab_files, infos_file, pop_map): # initiate the dict : barcode -> tab[seq] fq_name_for_barcode={} for key in tab_files: single_barcode=re.search("([ATCG]*)\.fq", key).groups(0)[0] fq_name_for_barcode[single_barcode]=key # open the infos file and output file my_open_info_file=open(infos_file, 'r') my_output_file=open(pop_map, 'w') # conversion tab for population to integer pop_to_int=[] # write infos into the final output for line in my_open_info_file: parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line) # if its the first meet with the population if parse_line.groups(1)[1] not in pop_to_int: pop_to_int.append(parse_line.groups(1)[1]) # manage ext if present, because the population map file should not have the ext if re.search("\.fq", fq_name_for_barcode[parse_line.groups(1)[0]]) or re.search("\.fastq", sam_name_for_barcode[parse_line.groups(1)[0]]): fqfile=os.path.splitext(fq_name_for_barcode[parse_line.groups(1)[0]])[0] else: fqfile=fq_name_for_barcode[parse_line.groups(1)[0]] # write in the file my_output_file.write(fqfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n") # close files my_output_file.close() my_open_info_file.close() def generate_popmap_for_refmap(tab_fq_files, tab_sam_files, infos_file, pop_map): # initiate the dict : barcode -> tab[seq] seq_id_for_barcode={} # initiate the dict : barcode -> sam_name sam_name_for_barcode={} ### Parse fastqfiles ### # insert my barcode into a tab with sequences ID associated for fastq_file in tab_fq_files.keys(): single_barcode=re.search("([ATCG]*)\.fq", fastq_file).groups(0)[0] # open the fasq file open_fastqfile=open(tab_fq_files[fastq_file], 'r') # for each line, get the seq ID tab_seq_id=[] for line in open_fastqfile: my_match_seqID=re.search("^@([A-Z0-9]+\.[0-9]+)\s.*", line) if my_match_seqID: tab_seq_id.append(my_match_seqID.groups(0)[0]) # push in a dict the tab of seqID for the current barcode seq_id_for_barcode[single_barcode]=tab_seq_id ### Parse samfiles and get the first seq id ### for sam_file in tab_sam_files.keys(): # open the sam file open_samfile=open(tab_sam_files[sam_file], 'r') # get the first seq id first_seq_id='' for line in open_samfile: if not re.search("^@", line): first_seq_id=line.split("\t")[0] break # map with seq_id_for_barcode structure for barcode in seq_id_for_barcode: for seq in seq_id_for_barcode[barcode]: if seq == first_seq_id: #print "sam -> "+sam_file+" seq -> "+first_seq_id+" barcode -> "+barcode sam_name_for_barcode[barcode]=sam_file break # open the infos file and output file my_open_info_file=open(infos_file, 'r') my_output_file=open(pop_map, 'w') # conversion tab for population to integer pop_to_int=[] # write infos into the final output for line in my_open_info_file: parse_line=re.search("(^[ATCG]+)\t(.*)\t.*", line) # if its the first meet with the population if parse_line.groups(1)[1] not in pop_to_int: pop_to_int.append(parse_line.groups(1)[1]) # manage ext if present, because the population map file should not have the ext if re.search("\.sam", sam_name_for_barcode[parse_line.groups(1)[0]]): samfile=os.path.splitext(sam_name_for_barcode[parse_line.groups(1)[0]])[0] else: samfile=sam_name_for_barcode[parse_line.groups(1)[0]] # write in the file my_output_file.write(samfile+"\t"+str(pop_to_int.index(parse_line.groups(1)[1]))+"\n") # close files my_output_file.close() my_open_info_file.close() """ STACKS POPULATION """ def extract_compress_files(myfile, tmp_input_dir): #test if is zip file if (check_zip( myfile )): # extract all files names and added it in the tab myarchive = zipfile.ZipFile(myfile, 'r') # extract all files myarchive.extractall(tmp_input_dir) #test if is tar.gz file else: # extract all files names and added it in the tab mygzfile = tarfile.open(myfile, 'r') # extract all files mygzfile.extractall(tmp_input_dir)