Mercurial > repos > public-health-bioinformatics > linelisting
comparison linelisting.py @ 0:be856549e863 draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author | public-health-bioinformatics |
---|---|
date | Thu, 04 Jul 2019 19:37:41 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:be856549e863 |
---|---|
1 #!/usr/bin/env python | |
2 '''Reads in a fasta file of extracted antigenic sites and one containing a | |
3 reference flu antigenic map, reading them in as protein SeqRecords. Compares each amino | |
4 acid of each sample antigenic map to corresponding sites in the reference and replaces | |
5 identical amino acids with dots. Writes headers (including amino acid position numbers | |
6 read in from the respective index array), the reference amino acid sequence and column | |
7 headings required for non-aggregated line lists. Outputs headers and modified (i.e. dotted) | |
8 sequences to a csv file.''' | |
9 | |
10 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Nov 2017''' | |
11 | |
12 import sys,string,os, time, Bio, re, argparse | |
13 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord | |
14 from Bio.SeqRecord import SeqRecord | |
15 from Bio.Alphabet import IUPAC | |
16 from Bio.Seq import Seq | |
17 | |
18 inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences | |
19 refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence | |
20 antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file | |
21 cladeDefinitionFile = sys.argv[4] #clade definition csv file | |
22 outFileHandle = sys.argv[5] #user-specifed output filename | |
23 | |
24 lineListFile = open(outFileHandle,'w') #open a writable output file | |
25 | |
26 indicesLine = "" #comma-separated antigenic site positions | |
27 cladeList = [] #list of clade names read from clade definition file | |
28 ref_seq = "" #reference antigenic map (protein sequence) | |
29 seqList = [] #list of aa sequences to compare to reference | |
30 | |
31 BC_list = [] #empty list for BC samples | |
32 AB_list = [] #empty list for AB samples | |
33 ON_list = [] #empty list for ON samples | |
34 QC_list = [] #empty list for QC samples | |
35 nonprov_list = [] #empty list for samples not in above 4 provinces | |
36 #dictionary for location-separated sequence lists | |
37 segregated_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} | |
38 uniqueSeqs = {} #empty dict with unique seqs as keys and lists of SeqRecords as values | |
39 | |
40 def replace_matching_aa_with_dot(record): | |
41 """Compare amino acids in record to reference sequence, replace matching symbols | |
42 with dots, and return record with modified amino acid sequence.""" | |
43 orig_seq = str(record.seq) #get sequence string from SeqRecord | |
44 mod_seq = "" | |
45 #replace only those aa's matching the reference with dots | |
46 for i in range(0, len(orig_seq)): | |
47 if (orig_seq[i] == ref_seq[i]): | |
48 mod_seq = mod_seq + '.' | |
49 else: | |
50 mod_seq = mod_seq + orig_seq[i] | |
51 #assign modified sequence to new SeqRecord and return it | |
52 rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") | |
53 return rec | |
54 | |
55 def extract_clade(record): | |
56 """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ | |
57 if record.id.endswith('No_Match'): | |
58 clade_name = 'No_Match' | |
59 end_index = record.id.index(clade_name) | |
60 record.id = record.id[:end_index -1] | |
61 return clade_name | |
62 else: # | |
63 for clade in cladeList: | |
64 if record.id.endswith(clade): | |
65 clade_name = clade | |
66 end_index = record.id.index(clade) | |
67 record.id = record.id[:end_index -1] | |
68 return clade_name | |
69 | |
70 def sort_by_location(record): | |
71 """Search sequence name for province name or 2 letter province code and add SeqRecord to | |
72 province-specific dictionary.""" | |
73 seq_name = record.id | |
74 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
75 BC_list.append(record) #add Sequence record to BC_list | |
76 elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): | |
77 AB_list.append(record) #add Sequence record to AB_list | |
78 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
79 ON_list.append(record) #add Sequence record to ON_list | |
80 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
81 QC_list.append(record) #add Sequence record to QC_list | |
82 else: | |
83 nonprov_list.append(record) #add Sequence record to nonprov_list | |
84 return | |
85 | |
86 def extract_province(record): | |
87 """Search sequence name for province name or 2 letter province code and return province.""" | |
88 seq_name = record.id | |
89 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
90 province = 'British Columbia' | |
91 elif ('-AB-' in seq_name) or ('Alberta' in seq_name): | |
92 province = '/Alberta/' | |
93 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
94 province = 'Ontario' | |
95 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
96 province = 'Quebec' | |
97 else: | |
98 province = "other" | |
99 return province | |
100 | |
101 def get_sequence_length(record): | |
102 """Return the length of a sequence in a Sequence record.""" | |
103 sequenceLength = len(str((record.seq))) | |
104 return sequenceLength | |
105 | |
106 def get_antigenic_site_substitutions(record): | |
107 """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" | |
108 sequenceLength = get_sequence_length(record) | |
109 seqString = str(record.seq) | |
110 matches = seqString.count('.') | |
111 substitutions = sequenceLength - matches | |
112 return substitutions | |
113 | |
114 def calculate_percent_id(record, substitutions): | |
115 """Calculate sequence identity to a reference (based on substitutions and sequence length) and return percent id.""" | |
116 sequenceLength = get_sequence_length(record) | |
117 percentID = (1.00 - (float(substitutions)/float(sequenceLength))) | |
118 return percentID | |
119 | |
120 def output_linelist(sequenceList): | |
121 """Output a list of SeqRecords to a non-aggregated line list in csv format.""" | |
122 for record in sequenceList: | |
123 #get province, clade from sequence record | |
124 province = extract_province(record) | |
125 clade = extract_clade(record) | |
126 #calculate number of substitutions and % id to reference | |
127 substitutions = get_antigenic_site_substitutions(record) | |
128 percentID = calculate_percent_id(record,substitutions) | |
129 name_part = (record.id).rstrip() + ',' | |
130 clade_part = clade + ',' | |
131 substitutions_part = str(substitutions) + ',' | |
132 percID_part = str(percentID) + ',' | |
133 col = " ," #empty column | |
134 sequence = str(record.seq).strip() | |
135 csv_seq = ",".join(sequence) +"," | |
136 #write linelisted antigenic maps to csv file | |
137 comma_sep_line = name_part + col + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | |
138 lineListFile.write(comma_sep_line) | |
139 return | |
140 | |
141 with open (antigenicSiteIndexArray,'r') as siteIndices: | |
142 """Read amino acid positions from antigenic site index array and print as header after one empty row.""" | |
143 col = "," #empty column | |
144 #read items separated by comma's to position list | |
145 for line in siteIndices: | |
146 #remove whitespace from the end of each line | |
147 indicesLine = line.rstrip() | |
148 row1 = "\n" | |
149 #add comma-separated AA positions to header line | |
150 row2 = col + col + col + col + indicesLine + "\n" | |
151 #write first (empty) and 2nd (amino acid position) lines to linelist output file | |
152 lineListFile.write(row1) | |
153 lineListFile.write(row2) | |
154 | |
155 with open (refAntigenicMap,'r') as refMapFile: | |
156 """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" | |
157 #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq | |
158 record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) | |
159 record = record.upper() | |
160 ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs | |
161 col = "," #empty column | |
162 name_part = (record.id).rstrip() + ',' | |
163 sequence = str(record.seq).strip() | |
164 csv_seq = ",".join(sequence) | |
165 #output row with reference sequence displayed above sample sequences | |
166 row3 = name_part + col + col + col + csv_seq + "\n" | |
167 lineListFile.write(row3) | |
168 #replaces digits in the indicesLine with empty strings | |
169 positions = indicesLine.split(',') | |
170 numPos = len(positions) | |
171 empty_indicesLine = ',' * numPos | |
172 #print column headers for sample sequences | |
173 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" | |
174 lineListFile.write(row4) | |
175 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) | |
176 | |
177 with open(cladeDefinitionFile,'r') as cladeFile: | |
178 """Read clade definition file and store clade names in a list.""" | |
179 #remove whitespace from the end of each line and split elements at commas | |
180 for line in cladeFile: | |
181 elementList = line.rstrip().split(',') | |
182 name = elementList[0] #move 1st element to name field | |
183 cladeList.append(name) | |
184 | |
185 with open(inputAntigenicMaps,'r') as extrAntigMapFile: | |
186 """Read antigenic maps as protein SeqRecords and add to list.""" | |
187 #read Sequences from fasta file, uppercase and add to seqList | |
188 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): | |
189 record = record.upper() | |
190 seqList.append(record) #add Seq to list of Sequences | |
191 | |
192 #print number of sequences to be process as user check | |
193 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList)) | |
194 #parse each antigenic map sequence object | |
195 for record in seqList: | |
196 #assign Sequence to dictionaries according to location in name | |
197 sort_by_location(record) | |
198 #sort dictionary keys that access province-segregated lists | |
199 sorted_segregated_list_keys = sorted(segregated_lists.keys()) | |
200 print("\nSequence Lists Sorted by Province: ") | |
201 #process each province-segregated SeqRecord list | |
202 for listname in sorted_segregated_list_keys: | |
203 #acesss list of sequences by the listname key | |
204 a_list = segregated_lists[listname] | |
205 # sort original SeqRecords by record id (i.e. name) | |
206 a_list = [f for f in sorted(a_list, key = lambda x : x.id)] | |
207 mod_list = [] # empty temporary list | |
208 for record in a_list: | |
209 #replace matching amino acid symbols with dots | |
210 rec = replace_matching_aa_with_dot(record) | |
211 mod_list.append(rec) #populate a list of modified records | |
212 segregated_lists[listname] = mod_list | |
213 print("\n'%s' List (Amino Acids identical to Reference Masked): " % (listname)) | |
214 #output the list to csv as non-aggregated linelist | |
215 output_linelist(segregated_lists[listname]) | |
216 | |
217 extrAntigMapFile.close() | |
218 refMapFile.close() | |
219 lineListFile.close() |