Mercurial > repos > public-health-bioinformatics > assign_clades
view assign_clades.py @ 0:a971083404a4 draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author | public-health-bioinformatics |
---|---|
date | Thu, 04 Jul 2019 19:34:32 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python '''Accepts fasta files containing amino acid sequence, reading them in as amino acid sequence objects. Reads influenza clade defintions (i.e. amino acids at certain positions) from .csv file into dictionary structure. Searches each of the amino acid sequence objects for a list of matching clades, assigns the most 'evolved' (i.e. child as opposed to parent clade) to the sequence. Appends "_cladename" to the Sequence name and generates a fasta file of original sequences with modified names.''' '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017''' import sys,string,os, time, Bio from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio.Seq import Seq localtime = time.asctime(time.localtime(time.time())) #date and time of analysis inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth" outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes outFile= open(outFileHandle,'w') #open a writable, appendable output file seqList = [] #list of aa sequence objects to parse for clade definitions cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) '''Searches record for required amino acids at defined positions. If found, assigns clade name to sequence name by appending underscore and clade name to record id.''' def call_clade(record): print("---------------------------------------------------------------------") print("Parsing %s for matching flu clade definitions..." % (record.id)) matchList = [] #empty list to hold clades that match 100% #iterate over each tuple in the clade list for clade in cladeList: cladeName = clade[0] #temp variable for name depth = clade[1] #temp variable for depth sites = clade[2] #temp variable for aa def dictionary shouldFind = len(sites) #number of sites that should match found = 0 #a counter to hold matches to antigenic sites #iterate over each position in sites dictionary for pos, aa in sites.items(): #translate pos to corresponding index in target sequence index = int(pos) - 1 #if record at index has same amino acid as 'aa', increment 'found' if record[index] == aa: found += 1 if (found == shouldFind): #add the matching clade tuple to the list of matches matchList.append(clade) return matchList '''Compares depth level of clades in a list and returns the most granular one''' def decide_clade_by_depth(matchList): #empty variable for maximum depth encountered max_depth = 0 best_match_name = '' #variable to hold most granular clade #for each matching clade, check depth of the corresponding tuple for clade in matchList: #if the current clade is 'deeper' than the one before it if clade[1] > max_depth: #store this depth max_depth = clade[1] #store name of the clade best_match_name = clade[0] return best_match_name '''opens the .csv file of clade definitions and clade "depth" ''' with open (inFileHandle2, 'r') as clade_file: #remove whitespace from the end of each line and split elements at commas for line in clade_file: #print "Current Line in File:" + line sites={} #initialize a dictionary for clade elementList = line.rstrip().split(',') new_list = [] #start a new list to put non-empty strings into #remove empty stings in list for item in elementList: if item != '': new_list.append(item) name = new_list.pop(0) #move 1st element to name field depth = int(new_list.pop(0)) #move 2nd element to depth field #read remaining pairs of non-null elements into clade def dictionary for i in range(0, len(new_list), 2): #move next 2 items from the list into the dict pos = new_list[i] aa = new_list[i + 1] sites[pos] = aa #add the clade info as a tuple to the cladeList[] oneClade =(name, depth, sites) cladeList.append(oneClade) print("The List of Clades:") for clade in cladeList: print("Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2]))) for pos, aa in clade[2].items(): print("Pos: %s\tAA: %s" % (pos,aa)) '''opens readable input file of sequences to parse using filename from cmd line, instantiates as AA Sequence objects, with ppercase sequences''' with open(inFileHandle1,'r') as inFile: #read in Sequences from fasta file, uppercase and add to seqList for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): record = record.upper() seqList.append(record) #add Seq to list of Sequences print("\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList)) #parse each target sequence object for record in seqList: clade_call = '' #empty variale for final clade call on sequence matchingCladeList = call_clade(record) #holds matching clade tuples #if there is more than one clade match if len(matchingCladeList) > 1: #choose the most granular clade based on depth clade_call = decide_clade_by_depth(matchingCladeList) #if there is only one clade call elif len(matchingCladeList) > 0: clade = matchingCladeList[0] #take the first tuple in the list clade_call = clade[0] #clade name is the first item in the tuple #empty list return, no matches else: clade_call = "No_Match" print(clade_call) seq_name = record.id mod_name = seq_name + "_" + clade_call print("New Sequence Name: " + mod_name) record.id = mod_name #output fasta file with clade calls appended to sequence names SeqIO.write(seqList,outFile,"fasta") #print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),outFileHandle))) inFile.close() clade_file.close() outFile.close()