Mercurial > repos > public-health-bioinformatics > aggregate_linelisting
comparison aggregate_linelisting.py @ 0:515c0c885f5d draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
| author | public-health-bioinformatics |
|---|---|
| date | Thu, 04 Jul 2019 19:40:13 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:515c0c885f5d |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 '''Reads in a fasta file of antigenic maps and one with the reference antigenic map as | |
| 3 protein SeqRecords. Compares amino acids of sample antigenic maps to corresponding sites | |
| 4 in the reference and masks identical amino acids with dots. Writes headers (including | |
| 5 amino acid position numbers read from the respective index array), the reference amino | |
| 6 acid sequence and column headings required for both non-aggregated and aggregated line lists. | |
| 7 Outputs all headers and modified (i.e. dotted) sample sequences to a csv file.''' | |
| 8 | |
| 9 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Jan 2018''' | |
| 10 | |
| 11 import sys,string,os, time, Bio, re, argparse | |
| 12 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord | |
| 13 from Bio.SeqRecord import SeqRecord | |
| 14 from Bio.Alphabet import IUPAC | |
| 15 from Bio.Seq import Seq | |
| 16 | |
| 17 inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences | |
| 18 refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence | |
| 19 antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file | |
| 20 cladeDefinitionFile = sys.argv[4] #clade definition csv file | |
| 21 outFileHandle = sys.argv[5] #user-specifed output filename | |
| 22 | |
| 23 agg_lineListFile = open(outFileHandle,'w') #open a writable output file | |
| 24 | |
| 25 indicesLine = "" #comma-separated antigenic site positions | |
| 26 cladeList = [] #list of clade names read from clade definition file | |
| 27 ref_seq = "" #reference antigenic map (protein sequence) | |
| 28 seqList = [] #list of aa sequences to compare to reference | |
| 29 | |
| 30 BC_list = [] #empty list for BC samples | |
| 31 AB_list = [] #empty list for AB samples | |
| 32 ON_list = [] #empty list for ON samples | |
| 33 QC_list = [] #empty list for QC samples | |
| 34 nonprov_list = [] #empty list for samples not in above 4 provinces | |
| 35 #dictionary for location-separated sequence lists | |
| 36 prov_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} | |
| 37 | |
| 38 def replace_matching_aa_with_dot(record): | |
| 39 """Compare amino acids in record to reference, mask identical symbols with dots, and return modified record.""" | |
| 40 orig_seq = str(record.seq) #sequence string from SeqRecord | |
| 41 mod_seq = "" | |
| 42 #replace only those aa's matching the reference with dots | |
| 43 for i in range(0, len(orig_seq)): | |
| 44 if (orig_seq[i] == ref_seq[i]): | |
| 45 mod_seq = mod_seq + '.' | |
| 46 else: | |
| 47 mod_seq = mod_seq + orig_seq[i] | |
| 48 #assign modified sequence to new SeqRecord and return it | |
| 49 rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") | |
| 50 return rec | |
| 51 | |
| 52 def extract_clade(record): | |
| 53 """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ | |
| 54 if record.id.endswith('No_Match'): | |
| 55 clade_name = 'No_Match' | |
| 56 else: # | |
| 57 for clade in cladeList: | |
| 58 if record.id.endswith(clade): | |
| 59 clade_name = clade | |
| 60 return clade_name | |
| 61 | |
| 62 def extract_sample_name(record, clade): | |
| 63 """Extract sample name from sequence name and return sample name. """ | |
| 64 end_index = record.id.index(clade) | |
| 65 sample_name = record.id[:end_index -1] | |
| 66 #return sample name as sequence name minus underscore and clade name | |
| 67 return sample_name | |
| 68 | |
| 69 def sort_by_location(record): | |
| 70 """Search sequence name for province name or 2-letter province code and add SeqRecord to | |
| 71 province-specific dictionary.""" | |
| 72 seq_name = record.id | |
| 73 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
| 74 BC_list.append(record) #add Sequence record to BC_list | |
| 75 elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): | |
| 76 AB_list.append(record) #add Sequence record to AB_list | |
| 77 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
| 78 ON_list.append(record) #add Sequence record to ON_list | |
| 79 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
| 80 QC_list.append(record) #add Sequence record to QC_list | |
| 81 else: | |
| 82 nonprov_list.append(record) #add Sequence record to nonprov_list | |
| 83 return | |
| 84 | |
| 85 def extract_province(record): | |
| 86 """Search sequence name for province name or 2-letter province code and return province.""" | |
| 87 seq_name = record.id | |
| 88 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
| 89 province = 'British Columbia' | |
| 90 elif ('-AB-' in seq_name) or ('Alberta' in seq_name): | |
| 91 province = '/Alberta/' | |
| 92 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
| 93 province = 'Ontario' | |
| 94 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
| 95 province = 'Quebec' | |
| 96 else: | |
| 97 province = "other" | |
| 98 return province | |
| 99 | |
| 100 def get_sequence_length(record): | |
| 101 """Return length of sequence in a SeqRecord.""" | |
| 102 sequenceLength = len(str((record.seq))) | |
| 103 return sequenceLength | |
| 104 | |
| 105 def get_antigenic_site_substitutions(record): | |
| 106 """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" | |
| 107 sequenceLength = get_sequence_length(record) | |
| 108 seqString = str(record.seq) | |
| 109 matches = seqString.count('.') | |
| 110 substitutions = sequenceLength - matches | |
| 111 return substitutions | |
| 112 | |
| 113 def calculate_percent_id(record, substitutions): | |
| 114 """Calculate percent sequence identity to reference sequence, based on substitutions | |
| 115 and sequence length and return percent id as a ratio (i.e. 0.90 no 90%).""" | |
| 116 sequenceLength = get_sequence_length(record) | |
| 117 percentID = (1.00 - (float(substitutions)/float(sequenceLength))) | |
| 118 return percentID | |
| 119 | |
| 120 def output_aggregated_linelist(a_list): | |
| 121 """Output aggregated line list of SeqRecords in csv format.""" | |
| 122 sequevars = {} #dict of sequevar: SeqRecord list | |
| 123 firstRecordID = None | |
| 124 #examine dotted/masked sequences in list and assign unique ones as dict keys | |
| 125 for rec in a_list: | |
| 126 rec = replace_matching_aa_with_dot(rec) | |
| 127 sequence =str(rec.seq) | |
| 128 #if the sequence is a key in the dict, add SeqRecord to list | |
| 129 if sequence in sequevars: | |
| 130 #if sequence already in dict as a key, increment the value | |
| 131 sequevars[sequence].append(rec) | |
| 132 else: | |
| 133 #if sequence not in dict, add is as new key with list of 1 SeqRecord | |
| 134 sequevars[sequence] = [rec] | |
| 135 #get list of sorted unique sequence keys | |
| 136 sorted_unique_seq_keys = sorted(sequevars.keys()) | |
| 137 #process each list of SeqRecords sharing a unique sequence | |
| 138 for u in sorted_unique_seq_keys: | |
| 139 #access list of sequences by unique sequence | |
| 140 listOfSeqs = sequevars[u] | |
| 141 #sort this list of SeqRecords by record.id (i.e. name) | |
| 142 listOfSeqs = [f for f in sorted(listOfSeqs, key = lambda x : x.id)] | |
| 143 N = len(listOfSeqs) | |
| 144 #output details of first SeqRecord to csv | |
| 145 firstRecord = listOfSeqs[0] | |
| 146 province = extract_province(firstRecord) | |
| 147 clade = extract_clade(firstRecord) | |
| 148 substitutions = get_antigenic_site_substitutions(firstRecord) | |
| 149 percentID = calculate_percent_id(firstRecord,substitutions) | |
| 150 name = extract_sample_name(firstRecord, clade) | |
| 151 name_part = name.rstrip() + ',' | |
| 152 N_part = str(N) + ',' | |
| 153 clade_part = clade + ',' | |
| 154 substitutions_part = str(substitutions) + ',' | |
| 155 percID_part = str(percentID) + ',' | |
| 156 col = " ," #empty column | |
| 157 sequence = str(firstRecord.seq).strip() | |
| 158 csv_seq = ",".join(sequence) +"," | |
| 159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | |
| 160 #write first member of unique sequence list to csv | |
| 161 agg_lineListFile.write(comma_sep_output) | |
| 162 #print sequence records in sequevar to console | |
| 163 print("\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u)) | |
| 164 | |
| 165 #to uncollapse sequevar group, print each member of the sequevar list to csv output | |
| 166 '''for i in range(1,len(listOfSeqs)): | |
| 167 currentRec = listOfSeqs[i] | |
| 168 province = extract_province(currentRec) | |
| 169 clade = extract_clade(currentRec) | |
| 170 substitutions = get_antigenic_site_substitutions(currentRec) | |
| 171 percentID = calculate_percent_id(currentRec,substitutions) | |
| 172 name_part = (currentRec.id).rstrip() + ',' | |
| 173 N_part = "n/a" + ',' | |
| 174 clade_part = clade + ',' | |
| 175 substitutions_part = str(substitutions) + ',' | |
| 176 percID_part = str(percentID) + ',' | |
| 177 col = " ," #empty column | |
| 178 sequence = str(currentRec.seq).strip() | |
| 179 csv_seq = ",".join(sequence) +"," | |
| 180 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | |
| 181 agg_lineListFile.write(comma_sep_output) ''' | |
| 182 return | |
| 183 | |
| 184 with open (antigenicSiteIndexArray,'r') as siteIndices: | |
| 185 """Read amino acid positions from antigenic site index array and print as header after one empty row.""" | |
| 186 col = "," #empty column | |
| 187 #read amino acid positions and remove trailing whitespace | |
| 188 for line in siteIndices: | |
| 189 #remove whitespace from the end of each line | |
| 190 indicesLine = line.rstrip() | |
| 191 row1 = "\n" | |
| 192 #add comma-separated AA positions to header line | |
| 193 row2 = col + col + col + col + indicesLine + "\n" | |
| 194 #write first (empty) and 2nd (amino acid position) lines to output file | |
| 195 agg_lineListFile.write(row1) | |
| 196 agg_lineListFile.write(row2) | |
| 197 | |
| 198 with open (refAntigenicMap,'r') as refMapFile: | |
| 199 """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" | |
| 200 #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq | |
| 201 record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) | |
| 202 record = record.upper() | |
| 203 ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs | |
| 204 col = "," #empty column | |
| 205 name_part = (record.id).rstrip() + ',' | |
| 206 sequence = str(record.seq).strip() | |
| 207 csv_seq = ",".join(sequence) | |
| 208 #output row with reference sequence displayed above sample sequences | |
| 209 row3 = name_part + col + col + col + csv_seq + "\n" | |
| 210 agg_lineListFile.write(row3) | |
| 211 positions = indicesLine.split(',') | |
| 212 numPos = len(positions) | |
| 213 empty_indicesLine = ',' * numPos | |
| 214 #print column headers for sample sequences | |
| 215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" | |
| 216 agg_lineListFile.write(row4) | |
| 217 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) | |
| 218 | |
| 219 with open(cladeDefinitionFile,'r') as cladeFile: | |
| 220 """Read clade definition file and store clade names in list.""" | |
| 221 #remove whitespace from the end of each line and split elements at commas | |
| 222 for line in cladeFile: | |
| 223 elementList = line.rstrip().split(',') | |
| 224 name = elementList[0] #move 1st element to name field | |
| 225 cladeList.append(name) | |
| 226 | |
| 227 with open(inputAntigenicMaps,'r') as extrAntigMapFile: | |
| 228 """Read antigenic maps as protein SeqRecords and add to list.""" | |
| 229 #read Sequences from fasta file, uppercase and add to seqList | |
| 230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): | |
| 231 record = record.upper() | |
| 232 seqList.append(record) #add Seq to list of Sequences | |
| 233 | |
| 234 #print number of sequences to be processed as user check | |
| 235 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList)) | |
| 236 for record in seqList: | |
| 237 #assign SeqRecords to province-specific dictionaries | |
| 238 sort_by_location(record) | |
| 239 | |
| 240 #access prov segregated lists in order | |
| 241 sorted_prov_keys = sorted(prov_lists.keys()) | |
| 242 print("\nSequence Lists Sorted by Province: ") | |
| 243 for prov in sorted_prov_keys: | |
| 244 current_list = prov_lists[prov] | |
| 245 #mask AA's identical to reference sequence with dot | |
| 246 masked_list = [] # empty temporary list to park masked sequences | |
| 247 for record in current_list: | |
| 248 masked_rec = replace_matching_aa_with_dot(record) | |
| 249 masked_list.append(masked_rec) | |
| 250 prov_lists[prov] = masked_list #replace original SeqRecord list with masked list | |
| 251 | |
| 252 #group sequences in province-sorted list into clades | |
| 253 for prov in sorted_prov_keys: | |
| 254 prov_list = prov_lists[prov] | |
| 255 by_clades_dict = {} #empty dict for clade:seqRecord list groups | |
| 256 print("\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov)) | |
| 257 for rec in prov_list: | |
| 258 clade = extract_clade(rec) | |
| 259 if clade in by_clades_dict: | |
| 260 #if clade already in dict as key, append record to list (value) | |
| 261 by_clades_dict[clade].append(rec) | |
| 262 else: #add clade as key to dict, value is list of 1 SeqRecord | |
| 263 by_clades_dict[clade] = [rec] | |
| 264 #get list of alphabetically sorted clade keys | |
| 265 sorted_clade_keys = sorted(by_clades_dict.keys()) | |
| 266 print("\tNumber of clades: ", len(by_clades_dict)) | |
| 267 #group each list of sequences in clade by sequevars | |
| 268 for key in sorted_clade_keys: | |
| 269 print("\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key]))) | |
| 270 a_list = by_clades_dict[key] | |
| 271 for seqrec in a_list: | |
| 272 print("\t %s: %s" %(seqrec.id,str(seqrec.seq))) | |
| 273 #output the list to csv as aggregated linelist | |
| 274 output_aggregated_linelist(a_list) | |
| 275 | |
| 276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle)) | |
| 277 extrAntigMapFile.close() | |
| 278 refMapFile.close() | |
| 279 agg_lineListFile.close() |
