comparison Tryp_V.py @ 0:36cb22bd911d draft

planemo upload for repository https://github.com/johnheap/VAPPER-Galaxy
author johnheap
date Wed, 04 Jul 2018 16:39:13 -0400
parents
children 4432e4183ebd
comparison
equal deleted inserted replaced
-1:000000000000 0:36cb22bd911d
1 """
2 * Copyright 2018 University of Liverpool
3 * Author: John Heap, Computational Biology Facility, UoL
4 * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 """
19
20 import matplotlib as mpl
21 mpl.use('Agg')
22
23 import subprocess
24 import shutil
25 import re
26 import pandas as pd
27 import os
28
29 import sys
30
31 import matplotlib.pyplot as plt
32 from matplotlib.patches import Patch
33 import seaborn as sns
34
35 def assembleWithVelvet(name, kmers, inslen, covcut, fastq1name,fastq2name):
36 #argString = "velveth " + name + "_k65 65 -shortPaired -fastq " + name + "_R1.fastq " + name + "_R2.fastq"
37 argString = "velveth " + name + "_k"+ kmers+" "+ kmers + " -shortPaired -fastq " + fastq1name+" "+fastq2name
38 print(argString)
39 returncode = subprocess.call(argString, shell=True)
40 if returncode != 0:
41 return "Error in velveth"
42 argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -clean yes -ins_length_sd 50 -min_pair_count 20"
43 #argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -cov_cutoff "+covcut+" -clean yes -ins_length_sd 50 -min_pair_count 20"
44 #argString = "velvetg " + name + "_k65 -exp_cov auto -ins_length 400 -cov_cutoff 5 -clean yes -ins_length_sd 50 -min_pair_count 20"+quietString
45 print(argString)
46 returncode = subprocess.call(argString, shell = True)
47 if returncode != 0:
48 return "Error in velvetg"
49 shutil.copyfile(name + "_k"+kmers+"//contigs.fa",name + ".fa") # my $namechange = "mv ".$input."_k65/contigs.fa ".$input.".fa";
50 return "ok"
51
52
53 def blastContigs(test_name,database):
54 print(test_name)
55 print(database)
56 #db_path = os.path.dirname(os.path.realpath(__file__))+database
57 db_path = database
58 argString = "blastn -db "+db_path+" -query "+test_name+".fa -outfmt 10 -out "+test_name+"_blast.txt"
59 print (argString)
60 returncode = subprocess.call(argString, shell = True)
61 if returncode != 0:
62 return "Error in blastall"
63 blast_df = pd.read_csv(""+test_name+"_blast.txt")
64 #print (blast_df)
65 #if ($temp[2] >= 98 & & $temp[3] > 100 & & $temp[10] < 0.001){
66 #'qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore'
67 blast_df.columns = ['qaccver', 'saccver', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue','bitscore']
68 blastResult_df = blast_df[(blast_df['pident']>=98) & (blast_df['length'] > 100) & (blast_df['evalue']<0.001) ]
69 blastResult_df = blastResult_df[['qaccver', 'saccver', 'pident']] #query accession.version, subject accession.version, Percentage of identical matches
70
71 return blastResult_df
72
73
74
75 def getCogsPresent(blastResult_df,strain,cogOrBinList):
76 blastResult_df.sort_values('pident',axis = 0, ascending=False, inplace=True)
77 nodeList = blastResult_df['qaccver'].tolist()
78 cogList = blastResult_df['saccver'].tolist()
79 cogSet = set(cogList) #get unique values
80 cogList = sorted(cogSet) #sort them
81
82 #print (cogList)
83 #print (len(cogList))
84
85 thereList = []
86 dataList = []
87 #dir_path = os.path.dirname(os.path.realpath(__file__))
88 fname = cogOrBinList
89 cnt = 0
90 with open (fname) as f:
91 for line in f:
92 dataList.append(line.rstrip('\n\r '))
93 if line.rstrip('\n\r ') in cogList:
94 thereList.append('1')
95 cnt = cnt+1
96 else:
97 thereList.append('0')
98
99 #print (thereList)
100 #print (cnt)
101 data = {'Cog': dataList, strain: thereList}
102 presence_df = pd.DataFrame(data)
103 #print (presence_df)
104 return presence_df
105
106 def addToCurrentData(cog_df, name):
107 dir_path = os.path.dirname(os.path.realpath(__file__))
108 j_fname = dir_path + r"/data/vivax/TvDatabase.csv"
109 tv_df = pd.read_csv(j_fname)
110
111 cogList = cog_df[name].tolist()
112 #cogList.insert(0,'Test')
113 #print (len(tv_df))
114 #print(len(cogList))
115
116 #print(cogList)
117 tv_df.loc[:,name]=cogList
118 return tv_df
119
120
121
122
123 def createClusterMap(tv_df,name,html_path,pdfExport):
124 #Retrieve Data
125 dir_path = os.path.dirname(os.path.realpath(__file__))
126 j_fname = dir_path+r"/data/vivax/geoTv.csv"
127 geo_df = pd.read_csv(j_fname)
128 geo_df.loc[len(geo_df)] =[name,name,'k']
129 print(geo_df)
130
131 myStrains = tv_df.columns.values.tolist() #beware first entry is COG
132 myStrains = myStrains[1:]
133 print(myStrains)
134 myPal = []
135 for s in myStrains:
136 col = geo_df[(geo_df['Strain'] == s)]['colour'].tolist()
137 myPal.append(col[0])
138 print(myPal)
139 mycogmap = ['skyblue', 'orangered'] # blue absent,red present
140 tv_df.set_index('COG', inplace=True)
141 tv_df = tv_df[tv_df.columns].astype(float)
142
143 cg = sns.clustermap(tv_df, method='ward', col_colors=myPal, cmap=mycogmap, yticklabels = 1500, row_cluster=False, linewidths = 0)
144 #cg = sns.clustermap(tv_df, method='ward', row_cluster=False, linewidths = 0)
145 ax = cg.ax_heatmap
146 #xasix ticks and labels.
147 ax.xaxis.tick_top() #set ticks at top
148 newlabs = []
149
150 labs = ax.xaxis.get_ticklabels()
151 for i in range(0, len(labs)):
152 print(labs[i])
153 # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars
154 newlabs.append(" " + labs[i].get_text())
155 ax.xaxis.set_ticklabels(newlabs)
156
157 #labs = ax.xaxis.get_ticklabels()
158 #for i in range(0, len(labs)):
159 # print(labs[i])
160 # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars
161 # print(labs[i])
162 #ax.xaxis.set_ticklabels(labs)
163 plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90) # get x labels printed vertically
164
165 cg.cax.set_visible(False)
166 ax = cg.ax_heatmap
167 ax.set_yticklabels("")
168 ax.set_ylabel("")
169 ax = cg.ax_heatmap
170 # ax.set_title("Variant antigen profiles of T. vivax genomes.\nDendrogram reflects the VSG repertoire relationships of each strain inferred by the presence and absence of non-universal T. vivax VSG orthologs.", va = "top", wrap = "True")
171 b = len(tv_df)
172 print(b)
173 title = "Figure Legend: The Variant Antigen Profiles of $\itTrypanosoma$ $\itvivax$ " \
174 "showing the \ncombination of present and absent diagnostic cluster of VSG orthologs " \
175 "across the sample cohort. \nDendrogram reflects the relationships amongst the VSG" \
176 " repertoires of each strain. " \
177 "Strains were isolated \nfrom multiple African countries as shown in the key.\nData was produced with the " \
178 "'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018)."
179
180 ax.text(-1.5, len(tv_df) + 8,
181 title,
182 ha="left", va="top", wrap="True")
183 col = cg.ax_col_dendrogram.get_position()
184 cg.ax_col_dendrogram.set_position([col.x0, col.y0*1.08, col.width, col.height*1.1])
185
186
187 legend_elements = [Patch(facecolor='orangered', label='COG Present'),
188 Patch (facecolor='skyblue', label='COG Absent'),
189 Patch(facecolor='w', label=''),
190 Patch (facecolor='b', label='Nigeria'),
191 Patch(facecolor = 'g', label='Uganda'),
192 Patch (facecolor='c', label='Gambia'),
193 Patch (facecolor='r', label='Ivory Coast'),
194 Patch(facecolor='m', label='Brazil'),
195 Patch(facecolor='k', label=name)]
196 #legend_test = [[Patch(facecolor='orangered'),Patch(facecolor='r')],["test","test2"]]
197 ax.legend(handles = legend_elements, bbox_to_anchor=(-0.3,1.2),loc = 'upper left')
198
199
200
201
202
203
204
205 #plt.setp(cg.ax_heatmap.yaxis.get_ticklabels(), rotation=0 ) # get y labels printed horizontally
206 # cg.dendrogram_col.linkage # linkage matrix for columns
207 # cg.dendrogram_row.linkage # linkage matrix for rows
208 # plt.savefig(r"results/" + name + "_heatmap.png")
209 #plt.savefig(htmlresource + "/heatmap.png")
210 #if pdf == 'PDF_Yes':
211 # plt.savefig(htmlresource + "/heatmap.pdf")
212 # shutil.copyfile("heatmap.pdf",heatmapfn) #
213 #plt.legend()
214 fname = html_path+"/"+name+"_clustermap.png"
215 cg.savefig(fname)
216 if pdfExport == 'PDF_Yes':
217 fname = html_path + "/" + name + "_clustermap.pdf"
218 cg.savefig(fname)
219 #plt.show()
220
221 def createHTML(name,htmlfn,htmlPath):
222 #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource
223 htmlString = r"<html><title>T.vivax VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma vivax</i> Variant Antigen Profile</h2><h3>"
224 htmlString += name
225
226
227 htmlString += r'<p> <h3>The Heat Map and Dendrogram</h3></p>'
228 imgString = r"<img src = '"+name+"_clustermap.png' alt='Cog Clustermap' style='max-width:100%'><br><br>"
229 htmlString += imgString
230 print(htmlString)
231
232 with open(htmlfn, "w") as htmlfile:
233 htmlfile.write(htmlString)
234
235
236 def vivax_assemble(args,dict):
237 #argdict = {'name': 2, 'pdfexport': 3, 'kmers': 4, 'inslen': 5, 'covcut': 6, 'forward': 7, 'reverse': 8, 'html_file': 9,'html_resource': 10}
238 #assembleWithVelvet("V2_Test", '65', '400', '5', "data/TviBrRp.1.clean", "data/TviBrRp.2.clean")
239
240 vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax"
241 assembleWithVelvet(args[dict['name']], args[dict['kmers']], args[dict['inslen']], args[dict['covcut']],
242 args[dict['forward']], args[dict['reverse']])
243 blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas")
244 orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt")
245
246 binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas")
247 binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt")
248 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True)
249
250 fname = args[dict['html_resource']] +args[dict['name']]+"_cogspresent.csv"
251 cogPresence_df.to_csv(fname)
252 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it.
253 createClusterMap(current_df, args[dict['name']],args[dict['html_resource']],args[dict['pdfexport']])
254 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']])
255
256 def test_cluster(args,dict):
257 print ("name: %s",args[dict['name']])
258 cogPresence_df = pd.read_csv("test_cogspresent.csv")
259 print(cogPresence_df)
260 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it.
261 createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']])
262
263 def vivax_contigs(args,dict):
264 # argdict = {'name': 2, 'pdfexport': 3, 'contigs': 4, 'html_file': 5, 'html_resource': 6}
265
266 #test_cluster(args,dict)
267 #return;
268
269 #subprocess.call('echo $PATH',shell = True)
270 #sys.exit(1)
271
272 vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax"
273 shutil.copyfile(args[dict['contigs']], args[dict['name']]+".fa")
274 blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas")
275 orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt")
276
277 binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas")
278 binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt")
279 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True)
280
281 fname = args[dict['html_resource']]+r"/"+ args[dict['name']]+"_cogspresent.csv"
282 cogPresence_df.to_csv(fname)
283 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it.
284 createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']])
285 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']])
286
287 if __name__ == "__main__":
288 #assembleWithVelvet("V2_Test",'65','400', '5',"data/TviBrRp.1.clean","data/TviBrRp.2.clean")
289 #assembleWithVelvet("V2_Test",'65','400', '5',"data/Tv493.1","data/Tv493.2")
290 #blastResult_df=blastContigs("V2_Test",r"/Database/COGs.fas")
291 cogPresence_df = pd.read_csv("test_cogspresent.csv")
292 print(cogPresence_df)
293 current_df = addToCurrentData(cogPresence_df,"vTest") # load in Tvdatabase and add cogPresence column to it.
294 createClusterMap(current_df, "vTest", "sausages","no")
295 createHTML("vTest","vTest.html","sausages")
296 sys.exit()
297
298 blastResult_df=blastContigs("Tv493",r"/Database/COGs.fas")
299 orthPresence_df = getCogsPresent(blastResult_df,"Tv493",r"/Database/COGlist.txt")
300
301 #binBlastResult_df=blastContigs("V2_Test",r"/Database/Bin_2.fas")
302
303 binBlastResult_df=blastContigs("Tv493",r"/Database/Bin_2.fas")
304 binPresence_df = getCogsPresent(binBlastResult_df,"Tv493",r"/Database/binlist.txt")
305 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True)
306 #now do the next bit?
307 current_df = addToCurrentData(cogPresence_df) # load in Tvdatabase and add cogPresence column to it.
308 createClusterMap(current_df,'Test',dict['html_resource'] ,dict['pdfexport'])
309
310
311 #print(cogPresence_df)
312 dir_path = os.path.dirname(os.path.realpath(__file__))
313 fname = dir_path+r"/results/V2_TestPresence.csv"
314 #fnameb = dir_path+r"/results/V2_Test_blastOrth.csv"
315 #fnameb_bin = dir_path+r"/results/V2_Test_blastBin.csv"
316 #binBlastResult_df.to_csv(fnameb_bin)
317 #blastResult_df.to_csv(fnameb)
318
319 #cogPresence_df.to_csv(fname)
320 cogPresence_df = pd.read_csv(fname)
321
322 current_df = addToCurrentData(cogPresence_df) #load in Tvdatabase and add cogPresence column to it.
323
324
325
326