comparison linelisting.py @ 0:be856549e863 draft default tip

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author public-health-bioinformatics
date Thu, 04 Jul 2019 19:37:41 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:be856549e863
1 #!/usr/bin/env python
2 '''Reads in a fasta file of extracted antigenic sites and one containing a
3 reference flu antigenic map, reading them in as protein SeqRecords. Compares each amino
4 acid of each sample antigenic map to corresponding sites in the reference and replaces
5 identical amino acids with dots. Writes headers (including amino acid position numbers
6 read in from the respective index array), the reference amino acid sequence and column
7 headings required for non-aggregated line lists. Outputs headers and modified (i.e. dotted)
8 sequences to a csv file.'''
9
10 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Nov 2017'''
11
12 import sys,string,os, time, Bio, re, argparse
13 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord
14 from Bio.SeqRecord import SeqRecord
15 from Bio.Alphabet import IUPAC
16 from Bio.Seq import Seq
17
18 inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences
19 refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence
20 antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file
21 cladeDefinitionFile = sys.argv[4] #clade definition csv file
22 outFileHandle = sys.argv[5] #user-specifed output filename
23
24 lineListFile = open(outFileHandle,'w') #open a writable output file
25
26 indicesLine = "" #comma-separated antigenic site positions
27 cladeList = [] #list of clade names read from clade definition file
28 ref_seq = "" #reference antigenic map (protein sequence)
29 seqList = [] #list of aa sequences to compare to reference
30
31 BC_list = [] #empty list for BC samples
32 AB_list = [] #empty list for AB samples
33 ON_list = [] #empty list for ON samples
34 QC_list = [] #empty list for QC samples
35 nonprov_list = [] #empty list for samples not in above 4 provinces
36 #dictionary for location-separated sequence lists
37 segregated_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list}
38 uniqueSeqs = {} #empty dict with unique seqs as keys and lists of SeqRecords as values
39
40 def replace_matching_aa_with_dot(record):
41 """Compare amino acids in record to reference sequence, replace matching symbols
42 with dots, and return record with modified amino acid sequence."""
43 orig_seq = str(record.seq) #get sequence string from SeqRecord
44 mod_seq = ""
45 #replace only those aa's matching the reference with dots
46 for i in range(0, len(orig_seq)):
47 if (orig_seq[i] == ref_seq[i]):
48 mod_seq = mod_seq + '.'
49 else:
50 mod_seq = mod_seq + orig_seq[i]
51 #assign modified sequence to new SeqRecord and return it
52 rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "")
53 return rec
54
55 def extract_clade(record):
56 """Extract clade name (or 'No_Match') from sequence name and return as clade name. """
57 if record.id.endswith('No_Match'):
58 clade_name = 'No_Match'
59 end_index = record.id.index(clade_name)
60 record.id = record.id[:end_index -1]
61 return clade_name
62 else: #
63 for clade in cladeList:
64 if record.id.endswith(clade):
65 clade_name = clade
66 end_index = record.id.index(clade)
67 record.id = record.id[:end_index -1]
68 return clade_name
69
70 def sort_by_location(record):
71 """Search sequence name for province name or 2 letter province code and add SeqRecord to
72 province-specific dictionary."""
73 seq_name = record.id
74 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name):
75 BC_list.append(record) #add Sequence record to BC_list
76 elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name):
77 AB_list.append(record) #add Sequence record to AB_list
78 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name):
79 ON_list.append(record) #add Sequence record to ON_list
80 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name):
81 QC_list.append(record) #add Sequence record to QC_list
82 else:
83 nonprov_list.append(record) #add Sequence record to nonprov_list
84 return
85
86 def extract_province(record):
87 """Search sequence name for province name or 2 letter province code and return province."""
88 seq_name = record.id
89 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name):
90 province = 'British Columbia'
91 elif ('-AB-' in seq_name) or ('Alberta' in seq_name):
92 province = '/Alberta/'
93 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name):
94 province = 'Ontario'
95 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name):
96 province = 'Quebec'
97 else:
98 province = "other"
99 return province
100
101 def get_sequence_length(record):
102 """Return the length of a sequence in a Sequence record."""
103 sequenceLength = len(str((record.seq)))
104 return sequenceLength
105
106 def get_antigenic_site_substitutions(record):
107 """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions."""
108 sequenceLength = get_sequence_length(record)
109 seqString = str(record.seq)
110 matches = seqString.count('.')
111 substitutions = sequenceLength - matches
112 return substitutions
113
114 def calculate_percent_id(record, substitutions):
115 """Calculate sequence identity to a reference (based on substitutions and sequence length) and return percent id."""
116 sequenceLength = get_sequence_length(record)
117 percentID = (1.00 - (float(substitutions)/float(sequenceLength)))
118 return percentID
119
120 def output_linelist(sequenceList):
121 """Output a list of SeqRecords to a non-aggregated line list in csv format."""
122 for record in sequenceList:
123 #get province, clade from sequence record
124 province = extract_province(record)
125 clade = extract_clade(record)
126 #calculate number of substitutions and % id to reference
127 substitutions = get_antigenic_site_substitutions(record)
128 percentID = calculate_percent_id(record,substitutions)
129 name_part = (record.id).rstrip() + ','
130 clade_part = clade + ','
131 substitutions_part = str(substitutions) + ','
132 percID_part = str(percentID) + ','
133 col = " ," #empty column
134 sequence = str(record.seq).strip()
135 csv_seq = ",".join(sequence) +","
136 #write linelisted antigenic maps to csv file
137 comma_sep_line = name_part + col + clade_part + col + csv_seq + substitutions_part + percID_part + "\n"
138 lineListFile.write(comma_sep_line)
139 return
140
141 with open (antigenicSiteIndexArray,'r') as siteIndices:
142 """Read amino acid positions from antigenic site index array and print as header after one empty row."""
143 col = "," #empty column
144 #read items separated by comma's to position list
145 for line in siteIndices:
146 #remove whitespace from the end of each line
147 indicesLine = line.rstrip()
148 row1 = "\n"
149 #add comma-separated AA positions to header line
150 row2 = col + col + col + col + indicesLine + "\n"
151 #write first (empty) and 2nd (amino acid position) lines to linelist output file
152 lineListFile.write(row1)
153 lineListFile.write(row2)
154
155 with open (refAntigenicMap,'r') as refMapFile:
156 """Read reference antigenic map from fasta and output amino acids, followed by column headers."""
157 #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq
158 record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein)
159 record = record.upper()
160 ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs
161 col = "," #empty column
162 name_part = (record.id).rstrip() + ','
163 sequence = str(record.seq).strip()
164 csv_seq = ",".join(sequence)
165 #output row with reference sequence displayed above sample sequences
166 row3 = name_part + col + col + col + csv_seq + "\n"
167 lineListFile.write(row3)
168 #replaces digits in the indicesLine with empty strings
169 positions = indicesLine.split(',')
170 numPos = len(positions)
171 empty_indicesLine = ',' * numPos
172 #print column headers for sample sequences
173 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n"
174 lineListFile.write(row4)
175 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record)))
176
177 with open(cladeDefinitionFile,'r') as cladeFile:
178 """Read clade definition file and store clade names in a list."""
179 #remove whitespace from the end of each line and split elements at commas
180 for line in cladeFile:
181 elementList = line.rstrip().split(',')
182 name = elementList[0] #move 1st element to name field
183 cladeList.append(name)
184
185 with open(inputAntigenicMaps,'r') as extrAntigMapFile:
186 """Read antigenic maps as protein SeqRecords and add to list."""
187 #read Sequences from fasta file, uppercase and add to seqList
188 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein):
189 record = record.upper()
190 seqList.append(record) #add Seq to list of Sequences
191
192 #print number of sequences to be process as user check
193 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList))
194 #parse each antigenic map sequence object
195 for record in seqList:
196 #assign Sequence to dictionaries according to location in name
197 sort_by_location(record)
198 #sort dictionary keys that access province-segregated lists
199 sorted_segregated_list_keys = sorted(segregated_lists.keys())
200 print("\nSequence Lists Sorted by Province: ")
201 #process each province-segregated SeqRecord list
202 for listname in sorted_segregated_list_keys:
203 #acesss list of sequences by the listname key
204 a_list = segregated_lists[listname]
205 # sort original SeqRecords by record id (i.e. name)
206 a_list = [f for f in sorted(a_list, key = lambda x : x.id)]
207 mod_list = [] # empty temporary list
208 for record in a_list:
209 #replace matching amino acid symbols with dots
210 rec = replace_matching_aa_with_dot(record)
211 mod_list.append(rec) #populate a list of modified records
212 segregated_lists[listname] = mod_list
213 print("\n'%s' List (Amino Acids identical to Reference Masked): " % (listname))
214 #output the list to csv as non-aggregated linelist
215 output_linelist(segregated_lists[listname])
216
217 extrAntigMapFile.close()
218 refMapFile.close()
219 lineListFile.close()