comparison assign_clades.py @ 0:a971083404a4 draft default tip

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author public-health-bioinformatics
date Thu, 04 Jul 2019 19:34:32 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a971083404a4
1 #!/usr/bin/env python
2
3 '''Accepts fasta files containing amino acid sequence, reading them in as
4 amino acid sequence objects. Reads influenza clade defintions (i.e. amino
5 acids at certain positions) from .csv file into dictionary structure. Searches
6 each of the amino acid sequence objects for a list of matching clades, assigns
7 the most 'evolved' (i.e. child as opposed to parent clade) to the sequence. Appends
8 "_cladename" to the Sequence name and generates a fasta file of original sequences with
9 modified names.'''
10
11 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017'''
12
13 import sys,string,os, time, Bio
14 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
15 from Bio.SeqRecord import SeqRecord
16 from Bio.Alphabet import IUPAC
17 from Bio.Seq import Seq
18
19 localtime = time.asctime(time.localtime(time.time())) #date and time of analysis
20 inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed
21 inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth"
22 outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes
23 outFile= open(outFileHandle,'w') #open a writable, appendable output file
24 seqList = [] #list of aa sequence objects to parse for clade definitions
25 cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..})
26
27 '''Searches record for required amino acids at defined positions. If found, assigns
28 clade name to sequence name by appending underscore and clade name to record id.'''
29 def call_clade(record):
30 print("---------------------------------------------------------------------")
31 print("Parsing %s for matching flu clade definitions..." % (record.id))
32 matchList = [] #empty list to hold clades that match 100%
33 #iterate over each tuple in the clade list
34 for clade in cladeList:
35 cladeName = clade[0] #temp variable for name
36 depth = clade[1] #temp variable for depth
37 sites = clade[2] #temp variable for aa def dictionary
38 shouldFind = len(sites) #number of sites that should match
39 found = 0 #a counter to hold matches to antigenic sites
40 #iterate over each position in sites dictionary
41 for pos, aa in sites.items():
42 #translate pos to corresponding index in target sequence
43 index = int(pos) - 1
44 #if record at index has same amino acid as 'aa', increment 'found'
45 if record[index] == aa:
46 found += 1
47 if (found == shouldFind):
48 #add the matching clade tuple to the list of matches
49 matchList.append(clade)
50 return matchList
51
52 '''Compares depth level of clades in a list and returns the most granular one'''
53 def decide_clade_by_depth(matchList):
54 #empty variable for maximum depth encountered
55 max_depth = 0
56 best_match_name = '' #variable to hold most granular clade
57 #for each matching clade, check depth of the corresponding tuple
58 for clade in matchList:
59 #if the current clade is 'deeper' than the one before it
60 if clade[1] > max_depth:
61 #store this depth
62 max_depth = clade[1]
63 #store name of the clade
64 best_match_name = clade[0]
65 return best_match_name
66
67 '''opens the .csv file of clade definitions and clade "depth" '''
68 with open (inFileHandle2, 'r') as clade_file:
69 #remove whitespace from the end of each line and split elements at commas
70 for line in clade_file:
71 #print "Current Line in File:" + line
72 sites={} #initialize a dictionary for clade
73 elementList = line.rstrip().split(',')
74 new_list = [] #start a new list to put non-empty strings into
75 #remove empty stings in list
76 for item in elementList:
77 if item != '':
78 new_list.append(item)
79 name = new_list.pop(0) #move 1st element to name field
80 depth = int(new_list.pop(0)) #move 2nd element to depth field
81 #read remaining pairs of non-null elements into clade def dictionary
82 for i in range(0, len(new_list), 2):
83 #move next 2 items from the list into the dict
84 pos = new_list[i]
85 aa = new_list[i + 1]
86 sites[pos] = aa
87 #add the clade info as a tuple to the cladeList[]
88 oneClade =(name, depth, sites)
89 cladeList.append(oneClade)
90 print("The List of Clades:")
91 for clade in cladeList:
92 print("Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])))
93 for pos, aa in clade[2].items():
94 print("Pos: %s\tAA: %s" % (pos,aa))
95
96 '''opens readable input file of sequences to parse using filename from cmd line,
97 instantiates as AA Sequence objects, with ppercase sequences'''
98 with open(inFileHandle1,'r') as inFile:
99 #read in Sequences from fasta file, uppercase and add to seqList
100 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein):
101 record = record.upper()
102 seqList.append(record) #add Seq to list of Sequences
103 print("\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList))
104 #parse each target sequence object
105 for record in seqList:
106 clade_call = '' #empty variale for final clade call on sequence
107 matchingCladeList = call_clade(record) #holds matching clade tuples
108 #if there is more than one clade match
109 if len(matchingCladeList) > 1:
110 #choose the most granular clade based on depth
111 clade_call = decide_clade_by_depth(matchingCladeList)
112 #if there is only one clade call
113 elif len(matchingCladeList) > 0:
114 clade = matchingCladeList[0] #take the first tuple in the list
115 clade_call = clade[0] #clade name is the first item in the tuple
116 #empty list return, no matches
117 else:
118 clade_call = "No_Match"
119 print(clade_call)
120 seq_name = record.id
121 mod_name = seq_name + "_" + clade_call
122 print("New Sequence Name: " + mod_name)
123 record.id = mod_name
124
125
126 #output fasta file with clade calls appended to sequence names
127 SeqIO.write(seqList,outFile,"fasta")
128
129 #print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),outFileHandle)))
130 inFile.close()
131 clade_file.close()
132 outFile.close()
133