Join GitHub today
+GitHub is home to over 36 million developers working together to host and review code, manage projects, and build software together.
+ Sign up ++ VAPPER-Galaxy/Tryp_V.py +
+ ++ | """ | +
+ | * Copyright 2018 University of Liverpool | +
+ | * Author: John Heap, Computational Biology Facility, UoL | +
+ | * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL | +
+ | * | +
+ | * Licensed under the Apache License, Version 2.0 (the "License"); | +
+ | * you may not use this file except in compliance with the License. | +
+ | * You may obtain a copy of the License at | +
+ | * | +
+ | * http://www.apache.org/licenses/LICENSE-2.0 | +
+ | * | +
+ | * Unless required by applicable law or agreed to in writing, software | +
+ | * distributed under the License is distributed on an "AS IS" BASIS, | +
+ | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | +
+ | * See the License for the specific language governing permissions and | +
+ | * limitations under the License. | +
+ | * | +
+ | """ | +
+ | + | +
+ | import matplotlib as mpl | +
+ | mpl.use('Agg') | +
+ | + | +
+ | import subprocess | +
+ | import shutil | +
+ | import re | +
+ | import pandas as pd | +
+ | import os | +
+ | + | +
+ | import sys | +
+ | + | +
+ | import matplotlib.pyplot as plt | +
+ | from matplotlib.patches import Patch | +
+ | import seaborn as sns | +
+ | + | +
+ | def assembleWithVelvet(name, kmers, inslen, covcut, fastq1name,fastq2name): | +
+ | #argString = "velveth " + name + "_k65 65 -shortPaired -fastq " + name + "_R1.fastq " + name + "_R2.fastq" | +
+ | argString = "velveth " + name + "_k"+ kmers+" "+ kmers + " -shortPaired -fastq " + fastq1name+" "+fastq2name | +
+ | print(argString) | +
+ | returncode = subprocess.call(argString, shell=True) | +
+ | if returncode != 0: | +
+ | return "Error in velveth" | +
+ | argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -clean yes -ins_length_sd 50 -min_pair_count 20" | +
+ | #argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -cov_cutoff "+covcut+" -clean yes -ins_length_sd 50 -min_pair_count 20" | +
+ | #argString = "velvetg " + name + "_k65 -exp_cov auto -ins_length 400 -cov_cutoff 5 -clean yes -ins_length_sd 50 -min_pair_count 20"+quietString | +
+ | print(argString) | +
+ | returncode = subprocess.call(argString, shell = True) | +
+ | if returncode != 0: | +
+ | return "Error in velvetg" | +
+ | shutil.copyfile(name + "_k"+kmers+"//contigs.fa",name + ".fa") # my $namechange = "mv ".$input."_k65/contigs.fa ".$input.".fa"; | +
+ | return "ok" | +
+ | + | +
+ | + | +
+ | def blastContigs(test_name,database): | +
+ | print(test_name) | +
+ | print(database) | +
+ | #db_path = os.path.dirname(os.path.realpath(__file__))+database | +
+ | db_path = database | +
+ | argString = "blastn -db "+db_path+" -query "+test_name+".fa -outfmt 10 -out "+test_name+"_blast.txt" | +
+ | print(argString) | +
+ | returncode = subprocess.call(argString, shell = True) | +
+ | if returncode != 0: | +
+ | return "Error in blastall" | +
+ | blast_df = pd.read_csv(""+test_name+"_blast.txt") | +
+ | #print (blast_df) | +
+ | #if ($temp[2] >= 98 & & $temp[3] > 100 & & $temp[10] < 0.001){ | +
+ | #'qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore' | +
+ | blast_df.columns = ['qaccver', 'saccver', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue','bitscore'] | +
+ | blastResult_df = blast_df[(blast_df['pident']>=98) & (blast_df['length'] > 100) & (blast_df['evalue']<0.001) ] | +
+ | blastResult_df = blastResult_df[['qaccver', 'saccver', 'pident']] #query accession.version, subject accession.version, Percentage of identical matches | +
+ | + | +
+ | return blastResult_df | +
+ | + | +
+ | + | +
+ | + | +
+ | def getCogsPresent(blastResult_df,strain,cogOrBinList): | +
+ | blastResult_df.sort_values('pident',axis = 0, ascending=False, inplace=True) | +
+ | nodeList = blastResult_df['qaccver'].tolist() | +
+ | cogList = blastResult_df['saccver'].tolist() | +
+ | cogSet = set(cogList) #get unique values | +
+ | cogList = sorted(cogSet) #sort them | +
+ | + | +
+ | #print (cogList) | +
+ | #print (len(cogList)) | +
+ | + | +
+ | thereList = [] | +
+ | dataList = [] | +
+ | #dir_path = os.path.dirname(os.path.realpath(__file__)) | +
+ | fname = cogOrBinList | +
+ | cnt = 0 | +
+ | with open (fname) as f: | +
+ | for line in f: | +
+ | dataList.append(line.rstrip('\n\r ')) | +
+ | if line.rstrip('\n\r ') in cogList: | +
+ | thereList.append('1') | +
+ | cnt = cnt+1 | +
+ | else: | +
+ | thereList.append('0') | +
+ | + | +
+ | #print (thereList) | +
+ | #print (cnt) | +
+ | data = {'Cog': dataList, strain: thereList} | +
+ | presence_df = pd.DataFrame(data) | +
+ | #print (presence_df) | +
+ | return presence_df | +
+ | + | +
+ | def addToCurrentData(cog_df, name): | +
+ | dir_path = os.path.dirname(os.path.realpath(__file__)) | +
+ | j_fname = dir_path + r"/data/vivax/TvDatabase.csv" | +
+ | tv_df = pd.read_csv(j_fname) | +
+ | + | +
+ | cogList = cog_df[name].tolist() | +
+ | #cogList.insert(0,'Test') | +
+ | #print (len(tv_df)) | +
+ | #print(len(cogList)) | +
+ | + | +
+ | #print(cogList) | +
+ | tv_df.loc[:,name]=cogList | +
+ | return tv_df | +
+ | + | +
+ | + | +
+ | + | +
+ | + | +
+ | def createClusterMap(tv_df,name,html_path,pdfExport): | +
+ | #Retrieve Data | +
+ | dir_path = os.path.dirname(os.path.realpath(__file__)) | +
+ | j_fname = dir_path+r"/data/vivax/geoTv.csv" | +
+ | geo_df = pd.read_csv(j_fname) | +
+ | geo_df.loc[len(geo_df)] =[name,name,'k'] | +
+ | print(geo_df) | +
+ | + | +
+ | myStrains = tv_df.columns.values.tolist() #beware first entry is COG | +
+ | myStrains = myStrains[1:] | +
+ | print(myStrains) | +
+ | myPal = [] | +
+ | for s in myStrains: | +
+ | col = geo_df[(geo_df['Strain'] == s)]['colour'].tolist() | +
+ | myPal.append(col[0]) | +
+ | print(myPal) | +
+ | mycogmap = ['skyblue', 'orangered'] # blue absent,red present | +
+ | tv_df.set_index('COG', inplace=True) | +
+ | tv_df = tv_df[tv_df.columns].astype(float) | +
+ | + | +
+ | cg = sns.clustermap(tv_df, method='ward', col_colors=myPal, cmap=mycogmap, yticklabels = 1500, row_cluster=False, linewidths = 0) | +
+ | #cg = sns.clustermap(tv_df, method='ward', row_cluster=False, linewidths = 0) | +
+ | ax = cg.ax_heatmap | +
+ | #xasix ticks and labels. | +
+ | ax.xaxis.tick_top() #set ticks at top | +
+ | newlabs = [] | +
+ | + | +
+ | labs = ax.xaxis.get_ticklabels() | +
+ | for i in range(0, len(labs)): | +
+ | print(labs[i]) | +
+ | # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars | +
+ | newlabs.append(" " + labs[i].get_text()) | +
+ | ax.xaxis.set_ticklabels(newlabs) | +
+ | + | +
+ | #labs = ax.xaxis.get_ticklabels() | +
+ | #for i in range(0, len(labs)): | +
+ | # print(labs[i]) | +
+ | # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars | +
+ | # print(labs[i]) | +
+ | #ax.xaxis.set_ticklabels(labs) | +
+ | plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90) # get x labels printed vertically | +
+ | + | +
+ | cg.cax.set_visible(False) | +
+ | ax = cg.ax_heatmap | +
+ | ax.set_yticklabels("") | +
+ | ax.set_ylabel("") | +
+ | ax = cg.ax_heatmap | +
+ | # ax.set_title("Variant antigen profiles of T. vivax genomes.\nDendrogram reflects the VSG repertoire relationships of each strain inferred by the presence and absence of non-universal T. vivax VSG orthologs.", va = "top", wrap = "True") | +
+ | b = len(tv_df) | +
+ | print(b) | +
+ | title = "Figure Legend: The Variant Antigen Profiles of $\itTrypanosoma$ $\itvivax$ " \ | +
+ | "showing the \ncombination of present and absent diagnostic cluster of VSG orthologs " \ | +
+ | "across the sample cohort. \nDendrogram reflects the relationships amongst the VSG" \ | +
+ | " repertoires of each strain. " \ | +
+ | "Strains were isolated \nfrom multiple African countries as shown in the key.\nData was produced with the " \ | +
+ | "'Variant Antigen Profiler' (Silva Pereira et al., 2019)." | +
+ | + | +
+ | ax.text(-1.5, len(tv_df) + 8, | +
+ | title, | +
+ | ha="left", va="top", wrap="True") | +
+ | col = cg.ax_col_dendrogram.get_position() | +
+ | cg.ax_col_dendrogram.set_position([col.x0, col.y0*1.08, col.width, col.height*1.1]) | +
+ | + | +
+ | countryList = pd.unique(geo_df['Location']) | +
+ | colourList = pd.unique(geo_df['colour']) | +
+ | legend_elements = [Patch(facecolor='orangered', label='COG Present'), | +
+ | Patch(facecolor='skyblue', label='COG Absent')] | +
+ | + | +
+ | for i in range(0, len(colourList)): | +
+ | print("country = %s, colour = %s" % (countryList[i], colourList[i])) | +
+ | p = Patch(facecolor=str(colourList[i]), label=countryList[i]) | +
+ | legend_elements.append(p) | +
+ | + | +
+ | ax.legend(handles = legend_elements, bbox_to_anchor=(-0.3,1.2),loc = 'upper left') | +
+ | + | +
+ | + | +
+ | + | +
+ | + | +
+ | + | +
+ | + | +
+ | + | +
+ | #plt.setp(cg.ax_heatmap.yaxis.get_ticklabels(), rotation=0 ) # get y labels printed horizontally | +
+ | # cg.dendrogram_col.linkage # linkage matrix for columns | +
+ | # cg.dendrogram_row.linkage # linkage matrix for rows | +
+ | # plt.savefig(r"results/" + name + "_heatmap.png") | +
+ | #plt.savefig(htmlresource + "/heatmap.png") | +
+ | #if pdf == 'PDF_Yes': | +
+ | # plt.savefig(htmlresource + "/heatmap.pdf") | +
+ | # shutil.copyfile("heatmap.pdf",heatmapfn) # | +
+ | #plt.legend() | +
+ | fname = html_path+"/"+name+"_clustermap.png" | +
+ | cg.savefig(fname) | +
+ | if pdfExport == 'PDF_Yes': | +
+ | fname = html_path + "/" + name + "_clustermap.pdf" | +
+ | cg.savefig(fname) | +
+ | #plt.show() | +
+ | + | +
+ | def createHTML(name,htmlfn,htmlPath): | +
+ | #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource | +
+ | htmlString = r"<html><title>T.vivax VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma vivax</i> Variant Antigen Profile</h2><h3>" | +
+ | htmlString += name | +
+ | + | +
+ | + | +
+ | htmlString += r'<p> <h3>The Heat Map and Dendrogram</h3></p>' | +
+ | imgString = r"<img src = '"+name+"_clustermap.png' alt='Cog Clustermap' style='max-width:100%'><br><br>" | +
+ | htmlString += imgString | +
+ | print(htmlString) | +
+ | + | +
+ | with open(htmlfn, "w") as htmlfile: | +
+ | htmlfile.write(htmlString) | +
+ | + | +
+ | + | +
+ | def vivax_assemble(args,dict): | +
+ | #argdict = {'name': 2, 'pdfexport': 3, 'kmers': 4, 'inslen': 5, 'covcut': 6, 'forward': 7, 'reverse': 8, 'html_file': 9,'html_resource': 10} | +
+ | #assembleWithVelvet("V2_Test", '65', '400', '5', "data/TviBrRp.1.clean", "data/TviBrRp.2.clean") | +
+ | + | +
+ | vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax" | +
+ | assembleWithVelvet(args[dict['name']], args[dict['kmers']], args[dict['inslen']], args[dict['covcut']], | +
+ | args[dict['forward']], args[dict['reverse']]) | +
+ | blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas") | +
+ | orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt") | +
+ | + | +
+ | binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas") | +
+ | binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt") | +
+ | cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | +
+ | + | +
+ | fname = args[dict['html_resource']] +args[dict['name']]+"_cogspresent.csv" | +
+ | cogPresence_df.to_csv(fname) | +
+ | current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | +
+ | createClusterMap(current_df, args[dict['name']],args[dict['html_resource']],args[dict['pdfexport']]) | +
+ | createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']]) | +
+ | + | +
+ | def test_cluster(args,dict): | +
+ | print ("name: %s",args[dict['name']]) | +
+ | cogPresence_df = pd.read_csv("test_cogspresent.csv") | +
+ | print(cogPresence_df) | +
+ | current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | +
+ | createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']]) | +
+ | + | +
+ | def vivax_contigs(args,dict): | +
+ | # argdict = {'name': 2, 'pdfexport': 3, 'contigs': 4, 'html_file': 5, 'html_resource': 6} | +
+ | + | +
+ | #test_cluster(args,dict) | +
+ | #return; | +
+ | + | +
+ | #subprocess.call('echo $PATH',shell = True) | +
+ | #sys.exit(1) | +
+ | + | +
+ | vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax" | +
+ | shutil.copyfile(args[dict['contigs']], args[dict['name']]+".fa") | +
+ | blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas") | +
+ | orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt") | +
+ | + | +
+ | binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas") | +
+ | binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt") | +
+ | cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | +
+ | + | +
+ | fname = args[dict['html_resource']]+r"/"+ args[dict['name']]+"_cogspresent.csv" | +
+ | cogPresence_df.to_csv(fname) | +
+ | current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | +
+ | createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']]) | +
+ | createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']]) | +
+ | + | +
+ | + | +
+ | if __name__ == "__main__": | +
+ | sys.exit() | +
+ | + | +
+ | + | +
+ | + | +
+ | + | +