Mercurial > repos > johnheap > vapper
comparison Tryp_V.py @ 0:36cb22bd911d draft
planemo upload for repository https://github.com/johnheap/VAPPER-Galaxy
author | johnheap |
---|---|
date | Wed, 04 Jul 2018 16:39:13 -0400 |
parents | |
children | 4432e4183ebd |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:36cb22bd911d |
---|---|
1 """ | |
2 * Copyright 2018 University of Liverpool | |
3 * Author: John Heap, Computational Biology Facility, UoL | |
4 * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL | |
5 * | |
6 * Licensed under the Apache License, Version 2.0 (the "License"); | |
7 * you may not use this file except in compliance with the License. | |
8 * You may obtain a copy of the License at | |
9 * | |
10 * http://www.apache.org/licenses/LICENSE-2.0 | |
11 * | |
12 * Unless required by applicable law or agreed to in writing, software | |
13 * distributed under the License is distributed on an "AS IS" BASIS, | |
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
15 * See the License for the specific language governing permissions and | |
16 * limitations under the License. | |
17 * | |
18 """ | |
19 | |
20 import matplotlib as mpl | |
21 mpl.use('Agg') | |
22 | |
23 import subprocess | |
24 import shutil | |
25 import re | |
26 import pandas as pd | |
27 import os | |
28 | |
29 import sys | |
30 | |
31 import matplotlib.pyplot as plt | |
32 from matplotlib.patches import Patch | |
33 import seaborn as sns | |
34 | |
35 def assembleWithVelvet(name, kmers, inslen, covcut, fastq1name,fastq2name): | |
36 #argString = "velveth " + name + "_k65 65 -shortPaired -fastq " + name + "_R1.fastq " + name + "_R2.fastq" | |
37 argString = "velveth " + name + "_k"+ kmers+" "+ kmers + " -shortPaired -fastq " + fastq1name+" "+fastq2name | |
38 print(argString) | |
39 returncode = subprocess.call(argString, shell=True) | |
40 if returncode != 0: | |
41 return "Error in velveth" | |
42 argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -clean yes -ins_length_sd 50 -min_pair_count 20" | |
43 #argString = "velvetg " + name + "_k"+kmers+" -exp_cov auto -ins_length "+inslen+" -cov_cutoff "+covcut+" -clean yes -ins_length_sd 50 -min_pair_count 20" | |
44 #argString = "velvetg " + name + "_k65 -exp_cov auto -ins_length 400 -cov_cutoff 5 -clean yes -ins_length_sd 50 -min_pair_count 20"+quietString | |
45 print(argString) | |
46 returncode = subprocess.call(argString, shell = True) | |
47 if returncode != 0: | |
48 return "Error in velvetg" | |
49 shutil.copyfile(name + "_k"+kmers+"//contigs.fa",name + ".fa") # my $namechange = "mv ".$input."_k65/contigs.fa ".$input.".fa"; | |
50 return "ok" | |
51 | |
52 | |
53 def blastContigs(test_name,database): | |
54 print(test_name) | |
55 print(database) | |
56 #db_path = os.path.dirname(os.path.realpath(__file__))+database | |
57 db_path = database | |
58 argString = "blastn -db "+db_path+" -query "+test_name+".fa -outfmt 10 -out "+test_name+"_blast.txt" | |
59 print (argString) | |
60 returncode = subprocess.call(argString, shell = True) | |
61 if returncode != 0: | |
62 return "Error in blastall" | |
63 blast_df = pd.read_csv(""+test_name+"_blast.txt") | |
64 #print (blast_df) | |
65 #if ($temp[2] >= 98 & & $temp[3] > 100 & & $temp[10] < 0.001){ | |
66 #'qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore' | |
67 blast_df.columns = ['qaccver', 'saccver', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue','bitscore'] | |
68 blastResult_df = blast_df[(blast_df['pident']>=98) & (blast_df['length'] > 100) & (blast_df['evalue']<0.001) ] | |
69 blastResult_df = blastResult_df[['qaccver', 'saccver', 'pident']] #query accession.version, subject accession.version, Percentage of identical matches | |
70 | |
71 return blastResult_df | |
72 | |
73 | |
74 | |
75 def getCogsPresent(blastResult_df,strain,cogOrBinList): | |
76 blastResult_df.sort_values('pident',axis = 0, ascending=False, inplace=True) | |
77 nodeList = blastResult_df['qaccver'].tolist() | |
78 cogList = blastResult_df['saccver'].tolist() | |
79 cogSet = set(cogList) #get unique values | |
80 cogList = sorted(cogSet) #sort them | |
81 | |
82 #print (cogList) | |
83 #print (len(cogList)) | |
84 | |
85 thereList = [] | |
86 dataList = [] | |
87 #dir_path = os.path.dirname(os.path.realpath(__file__)) | |
88 fname = cogOrBinList | |
89 cnt = 0 | |
90 with open (fname) as f: | |
91 for line in f: | |
92 dataList.append(line.rstrip('\n\r ')) | |
93 if line.rstrip('\n\r ') in cogList: | |
94 thereList.append('1') | |
95 cnt = cnt+1 | |
96 else: | |
97 thereList.append('0') | |
98 | |
99 #print (thereList) | |
100 #print (cnt) | |
101 data = {'Cog': dataList, strain: thereList} | |
102 presence_df = pd.DataFrame(data) | |
103 #print (presence_df) | |
104 return presence_df | |
105 | |
106 def addToCurrentData(cog_df, name): | |
107 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
108 j_fname = dir_path + r"/data/vivax/TvDatabase.csv" | |
109 tv_df = pd.read_csv(j_fname) | |
110 | |
111 cogList = cog_df[name].tolist() | |
112 #cogList.insert(0,'Test') | |
113 #print (len(tv_df)) | |
114 #print(len(cogList)) | |
115 | |
116 #print(cogList) | |
117 tv_df.loc[:,name]=cogList | |
118 return tv_df | |
119 | |
120 | |
121 | |
122 | |
123 def createClusterMap(tv_df,name,html_path,pdfExport): | |
124 #Retrieve Data | |
125 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
126 j_fname = dir_path+r"/data/vivax/geoTv.csv" | |
127 geo_df = pd.read_csv(j_fname) | |
128 geo_df.loc[len(geo_df)] =[name,name,'k'] | |
129 print(geo_df) | |
130 | |
131 myStrains = tv_df.columns.values.tolist() #beware first entry is COG | |
132 myStrains = myStrains[1:] | |
133 print(myStrains) | |
134 myPal = [] | |
135 for s in myStrains: | |
136 col = geo_df[(geo_df['Strain'] == s)]['colour'].tolist() | |
137 myPal.append(col[0]) | |
138 print(myPal) | |
139 mycogmap = ['skyblue', 'orangered'] # blue absent,red present | |
140 tv_df.set_index('COG', inplace=True) | |
141 tv_df = tv_df[tv_df.columns].astype(float) | |
142 | |
143 cg = sns.clustermap(tv_df, method='ward', col_colors=myPal, cmap=mycogmap, yticklabels = 1500, row_cluster=False, linewidths = 0) | |
144 #cg = sns.clustermap(tv_df, method='ward', row_cluster=False, linewidths = 0) | |
145 ax = cg.ax_heatmap | |
146 #xasix ticks and labels. | |
147 ax.xaxis.tick_top() #set ticks at top | |
148 newlabs = [] | |
149 | |
150 labs = ax.xaxis.get_ticklabels() | |
151 for i in range(0, len(labs)): | |
152 print(labs[i]) | |
153 # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars | |
154 newlabs.append(" " + labs[i].get_text()) | |
155 ax.xaxis.set_ticklabels(newlabs) | |
156 | |
157 #labs = ax.xaxis.get_ticklabels() | |
158 #for i in range(0, len(labs)): | |
159 # print(labs[i]) | |
160 # labs[i].set_text(" "+labs[i].get_text()) #make enough room so label sits atop of col_color bars | |
161 # print(labs[i]) | |
162 #ax.xaxis.set_ticklabels(labs) | |
163 plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90) # get x labels printed vertically | |
164 | |
165 cg.cax.set_visible(False) | |
166 ax = cg.ax_heatmap | |
167 ax.set_yticklabels("") | |
168 ax.set_ylabel("") | |
169 ax = cg.ax_heatmap | |
170 # ax.set_title("Variant antigen profiles of T. vivax genomes.\nDendrogram reflects the VSG repertoire relationships of each strain inferred by the presence and absence of non-universal T. vivax VSG orthologs.", va = "top", wrap = "True") | |
171 b = len(tv_df) | |
172 print(b) | |
173 title = "Figure Legend: The Variant Antigen Profiles of $\itTrypanosoma$ $\itvivax$ " \ | |
174 "showing the \ncombination of present and absent diagnostic cluster of VSG orthologs " \ | |
175 "across the sample cohort. \nDendrogram reflects the relationships amongst the VSG" \ | |
176 " repertoires of each strain. " \ | |
177 "Strains were isolated \nfrom multiple African countries as shown in the key.\nData was produced with the " \ | |
178 "'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018)." | |
179 | |
180 ax.text(-1.5, len(tv_df) + 8, | |
181 title, | |
182 ha="left", va="top", wrap="True") | |
183 col = cg.ax_col_dendrogram.get_position() | |
184 cg.ax_col_dendrogram.set_position([col.x0, col.y0*1.08, col.width, col.height*1.1]) | |
185 | |
186 | |
187 legend_elements = [Patch(facecolor='orangered', label='COG Present'), | |
188 Patch (facecolor='skyblue', label='COG Absent'), | |
189 Patch(facecolor='w', label=''), | |
190 Patch (facecolor='b', label='Nigeria'), | |
191 Patch(facecolor = 'g', label='Uganda'), | |
192 Patch (facecolor='c', label='Gambia'), | |
193 Patch (facecolor='r', label='Ivory Coast'), | |
194 Patch(facecolor='m', label='Brazil'), | |
195 Patch(facecolor='k', label=name)] | |
196 #legend_test = [[Patch(facecolor='orangered'),Patch(facecolor='r')],["test","test2"]] | |
197 ax.legend(handles = legend_elements, bbox_to_anchor=(-0.3,1.2),loc = 'upper left') | |
198 | |
199 | |
200 | |
201 | |
202 | |
203 | |
204 | |
205 #plt.setp(cg.ax_heatmap.yaxis.get_ticklabels(), rotation=0 ) # get y labels printed horizontally | |
206 # cg.dendrogram_col.linkage # linkage matrix for columns | |
207 # cg.dendrogram_row.linkage # linkage matrix for rows | |
208 # plt.savefig(r"results/" + name + "_heatmap.png") | |
209 #plt.savefig(htmlresource + "/heatmap.png") | |
210 #if pdf == 'PDF_Yes': | |
211 # plt.savefig(htmlresource + "/heatmap.pdf") | |
212 # shutil.copyfile("heatmap.pdf",heatmapfn) # | |
213 #plt.legend() | |
214 fname = html_path+"/"+name+"_clustermap.png" | |
215 cg.savefig(fname) | |
216 if pdfExport == 'PDF_Yes': | |
217 fname = html_path + "/" + name + "_clustermap.pdf" | |
218 cg.savefig(fname) | |
219 #plt.show() | |
220 | |
221 def createHTML(name,htmlfn,htmlPath): | |
222 #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource | |
223 htmlString = r"<html><title>T.vivax VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma vivax</i> Variant Antigen Profile</h2><h3>" | |
224 htmlString += name | |
225 | |
226 | |
227 htmlString += r'<p> <h3>The Heat Map and Dendrogram</h3></p>' | |
228 imgString = r"<img src = '"+name+"_clustermap.png' alt='Cog Clustermap' style='max-width:100%'><br><br>" | |
229 htmlString += imgString | |
230 print(htmlString) | |
231 | |
232 with open(htmlfn, "w") as htmlfile: | |
233 htmlfile.write(htmlString) | |
234 | |
235 | |
236 def vivax_assemble(args,dict): | |
237 #argdict = {'name': 2, 'pdfexport': 3, 'kmers': 4, 'inslen': 5, 'covcut': 6, 'forward': 7, 'reverse': 8, 'html_file': 9,'html_resource': 10} | |
238 #assembleWithVelvet("V2_Test", '65', '400', '5', "data/TviBrRp.1.clean", "data/TviBrRp.2.clean") | |
239 | |
240 vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax" | |
241 assembleWithVelvet(args[dict['name']], args[dict['kmers']], args[dict['inslen']], args[dict['covcut']], | |
242 args[dict['forward']], args[dict['reverse']]) | |
243 blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas") | |
244 orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt") | |
245 | |
246 binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas") | |
247 binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt") | |
248 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | |
249 | |
250 fname = args[dict['html_resource']] +args[dict['name']]+"_cogspresent.csv" | |
251 cogPresence_df.to_csv(fname) | |
252 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | |
253 createClusterMap(current_df, args[dict['name']],args[dict['html_resource']],args[dict['pdfexport']]) | |
254 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']]) | |
255 | |
256 def test_cluster(args,dict): | |
257 print ("name: %s",args[dict['name']]) | |
258 cogPresence_df = pd.read_csv("test_cogspresent.csv") | |
259 print(cogPresence_df) | |
260 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | |
261 createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']]) | |
262 | |
263 def vivax_contigs(args,dict): | |
264 # argdict = {'name': 2, 'pdfexport': 3, 'contigs': 4, 'html_file': 5, 'html_resource': 6} | |
265 | |
266 #test_cluster(args,dict) | |
267 #return; | |
268 | |
269 #subprocess.call('echo $PATH',shell = True) | |
270 #sys.exit(1) | |
271 | |
272 vivaxPath = os.path.dirname(os.path.realpath(__file__))+r"/data/vivax" | |
273 shutil.copyfile(args[dict['contigs']], args[dict['name']]+".fa") | |
274 blastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/COGs.fas") | |
275 orthPresence_df = getCogsPresent(blastResult_df, args[dict['name']], vivaxPath+r"/Database/COGlist.txt") | |
276 | |
277 binBlastResult_df = blastContigs(args[dict['name']], vivaxPath+r"/Database/Bin_2.fas") | |
278 binPresence_df = getCogsPresent(binBlastResult_df, args[dict['name']], vivaxPath+r"/Database/binlist.txt") | |
279 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | |
280 | |
281 fname = args[dict['html_resource']]+r"/"+ args[dict['name']]+"_cogspresent.csv" | |
282 cogPresence_df.to_csv(fname) | |
283 current_df = addToCurrentData(cogPresence_df,args[dict['name']]) # load in Tvdatabase and add cogPresence column to it. | |
284 createClusterMap(current_df, args[dict['name']], args[dict['html_resource']], args[dict['pdfexport']]) | |
285 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']]) | |
286 | |
287 if __name__ == "__main__": | |
288 #assembleWithVelvet("V2_Test",'65','400', '5',"data/TviBrRp.1.clean","data/TviBrRp.2.clean") | |
289 #assembleWithVelvet("V2_Test",'65','400', '5',"data/Tv493.1","data/Tv493.2") | |
290 #blastResult_df=blastContigs("V2_Test",r"/Database/COGs.fas") | |
291 cogPresence_df = pd.read_csv("test_cogspresent.csv") | |
292 print(cogPresence_df) | |
293 current_df = addToCurrentData(cogPresence_df,"vTest") # load in Tvdatabase and add cogPresence column to it. | |
294 createClusterMap(current_df, "vTest", "sausages","no") | |
295 createHTML("vTest","vTest.html","sausages") | |
296 sys.exit() | |
297 | |
298 blastResult_df=blastContigs("Tv493",r"/Database/COGs.fas") | |
299 orthPresence_df = getCogsPresent(blastResult_df,"Tv493",r"/Database/COGlist.txt") | |
300 | |
301 #binBlastResult_df=blastContigs("V2_Test",r"/Database/Bin_2.fas") | |
302 | |
303 binBlastResult_df=blastContigs("Tv493",r"/Database/Bin_2.fas") | |
304 binPresence_df = getCogsPresent(binBlastResult_df,"Tv493",r"/Database/binlist.txt") | |
305 cogPresence_df = orthPresence_df.append(binPresence_df, ignore_index=True) | |
306 #now do the next bit? | |
307 current_df = addToCurrentData(cogPresence_df) # load in Tvdatabase and add cogPresence column to it. | |
308 createClusterMap(current_df,'Test',dict['html_resource'] ,dict['pdfexport']) | |
309 | |
310 | |
311 #print(cogPresence_df) | |
312 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
313 fname = dir_path+r"/results/V2_TestPresence.csv" | |
314 #fnameb = dir_path+r"/results/V2_Test_blastOrth.csv" | |
315 #fnameb_bin = dir_path+r"/results/V2_Test_blastBin.csv" | |
316 #binBlastResult_df.to_csv(fnameb_bin) | |
317 #blastResult_df.to_csv(fnameb) | |
318 | |
319 #cogPresence_df.to_csv(fname) | |
320 cogPresence_df = pd.read_csv(fname) | |
321 | |
322 current_df = addToCurrentData(cogPresence_df) #load in Tvdatabase and add cogPresence column to it. | |
323 | |
324 | |
325 | |
326 |