Mercurial > repos > public-health-bioinformatics > linelisting
view linelisting.py @ 0:be856549e863 draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit b96b6e06f6eaa6ae8ef4c24630dbb72a4aed7dbe
author | public-health-bioinformatics |
---|---|
date | Thu, 04 Jul 2019 19:37:41 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python '''Reads in a fasta file of extracted antigenic sites and one containing a reference flu antigenic map, reading them in as protein SeqRecords. Compares each amino acid of each sample antigenic map to corresponding sites in the reference and replaces identical amino acids with dots. Writes headers (including amino acid position numbers read in from the respective index array), the reference amino acid sequence and column headings required for non-aggregated line lists. Outputs headers and modified (i.e. dotted) sequences to a csv file.''' '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Nov 2017''' import sys,string,os, time, Bio, re, argparse from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio.Seq import Seq inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file cladeDefinitionFile = sys.argv[4] #clade definition csv file outFileHandle = sys.argv[5] #user-specifed output filename lineListFile = open(outFileHandle,'w') #open a writable output file indicesLine = "" #comma-separated antigenic site positions cladeList = [] #list of clade names read from clade definition file ref_seq = "" #reference antigenic map (protein sequence) seqList = [] #list of aa sequences to compare to reference BC_list = [] #empty list for BC samples AB_list = [] #empty list for AB samples ON_list = [] #empty list for ON samples QC_list = [] #empty list for QC samples nonprov_list = [] #empty list for samples not in above 4 provinces #dictionary for location-separated sequence lists segregated_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} uniqueSeqs = {} #empty dict with unique seqs as keys and lists of SeqRecords as values def replace_matching_aa_with_dot(record): """Compare amino acids in record to reference sequence, replace matching symbols with dots, and return record with modified amino acid sequence.""" orig_seq = str(record.seq) #get sequence string from SeqRecord mod_seq = "" #replace only those aa's matching the reference with dots for i in range(0, len(orig_seq)): if (orig_seq[i] == ref_seq[i]): mod_seq = mod_seq + '.' else: mod_seq = mod_seq + orig_seq[i] #assign modified sequence to new SeqRecord and return it rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") return rec def extract_clade(record): """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ if record.id.endswith('No_Match'): clade_name = 'No_Match' end_index = record.id.index(clade_name) record.id = record.id[:end_index -1] return clade_name else: # for clade in cladeList: if record.id.endswith(clade): clade_name = clade end_index = record.id.index(clade) record.id = record.id[:end_index -1] return clade_name def sort_by_location(record): """Search sequence name for province name or 2 letter province code and add SeqRecord to province-specific dictionary.""" seq_name = record.id if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): BC_list.append(record) #add Sequence record to BC_list elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): AB_list.append(record) #add Sequence record to AB_list elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): ON_list.append(record) #add Sequence record to ON_list elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): QC_list.append(record) #add Sequence record to QC_list else: nonprov_list.append(record) #add Sequence record to nonprov_list return def extract_province(record): """Search sequence name for province name or 2 letter province code and return province.""" seq_name = record.id if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): province = 'British Columbia' elif ('-AB-' in seq_name) or ('Alberta' in seq_name): province = '/Alberta/' elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): province = 'Ontario' elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): province = 'Quebec' else: province = "other" return province def get_sequence_length(record): """Return the length of a sequence in a Sequence record.""" sequenceLength = len(str((record.seq))) return sequenceLength def get_antigenic_site_substitutions(record): """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" sequenceLength = get_sequence_length(record) seqString = str(record.seq) matches = seqString.count('.') substitutions = sequenceLength - matches return substitutions def calculate_percent_id(record, substitutions): """Calculate sequence identity to a reference (based on substitutions and sequence length) and return percent id.""" sequenceLength = get_sequence_length(record) percentID = (1.00 - (float(substitutions)/float(sequenceLength))) return percentID def output_linelist(sequenceList): """Output a list of SeqRecords to a non-aggregated line list in csv format.""" for record in sequenceList: #get province, clade from sequence record province = extract_province(record) clade = extract_clade(record) #calculate number of substitutions and % id to reference substitutions = get_antigenic_site_substitutions(record) percentID = calculate_percent_id(record,substitutions) name_part = (record.id).rstrip() + ',' clade_part = clade + ',' substitutions_part = str(substitutions) + ',' percID_part = str(percentID) + ',' col = " ," #empty column sequence = str(record.seq).strip() csv_seq = ",".join(sequence) +"," #write linelisted antigenic maps to csv file comma_sep_line = name_part + col + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" lineListFile.write(comma_sep_line) return with open (antigenicSiteIndexArray,'r') as siteIndices: """Read amino acid positions from antigenic site index array and print as header after one empty row.""" col = "," #empty column #read items separated by comma's to position list for line in siteIndices: #remove whitespace from the end of each line indicesLine = line.rstrip() row1 = "\n" #add comma-separated AA positions to header line row2 = col + col + col + col + indicesLine + "\n" #write first (empty) and 2nd (amino acid position) lines to linelist output file lineListFile.write(row1) lineListFile.write(row2) with open (refAntigenicMap,'r') as refMapFile: """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) record = record.upper() ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs col = "," #empty column name_part = (record.id).rstrip() + ',' sequence = str(record.seq).strip() csv_seq = ",".join(sequence) #output row with reference sequence displayed above sample sequences row3 = name_part + col + col + col + csv_seq + "\n" lineListFile.write(row3) #replaces digits in the indicesLine with empty strings positions = indicesLine.split(',') numPos = len(positions) empty_indicesLine = ',' * numPos #print column headers for sample sequences row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" lineListFile.write(row4) print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) with open(cladeDefinitionFile,'r') as cladeFile: """Read clade definition file and store clade names in a list.""" #remove whitespace from the end of each line and split elements at commas for line in cladeFile: elementList = line.rstrip().split(',') name = elementList[0] #move 1st element to name field cladeList.append(name) with open(inputAntigenicMaps,'r') as extrAntigMapFile: """Read antigenic maps as protein SeqRecords and add to list.""" #read Sequences from fasta file, uppercase and add to seqList for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): record = record.upper() seqList.append(record) #add Seq to list of Sequences #print number of sequences to be process as user check print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList)) #parse each antigenic map sequence object for record in seqList: #assign Sequence to dictionaries according to location in name sort_by_location(record) #sort dictionary keys that access province-segregated lists sorted_segregated_list_keys = sorted(segregated_lists.keys()) print("\nSequence Lists Sorted by Province: ") #process each province-segregated SeqRecord list for listname in sorted_segregated_list_keys: #acesss list of sequences by the listname key a_list = segregated_lists[listname] # sort original SeqRecords by record id (i.e. name) a_list = [f for f in sorted(a_list, key = lambda x : x.id)] mod_list = [] # empty temporary list for record in a_list: #replace matching amino acid symbols with dots rec = replace_matching_aa_with_dot(record) mod_list.append(rec) #populate a list of modified records segregated_lists[listname] = mod_list print("\n'%s' List (Amino Acids identical to Reference Masked): " % (listname)) #output the list to csv as non-aggregated linelist output_linelist(segregated_lists[listname]) extrAntigMapFile.close() refMapFile.close() lineListFile.close()