comparison Tryp_T.py @ 0:36cb22bd911d draft

planemo upload for repository https://github.com/johnheap/VAPPER-Galaxy
author johnheap
date Wed, 04 Jul 2018 16:39:13 -0400
parents
children 8f6469ffef85
comparison
equal deleted inserted replaced
-1:000000000000 0:36cb22bd911d
1 """
2 * Copyright 2018 University of Liverpool
3 * Author: John Heap, Computational Biology Facility, UoL
4 * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 """
19
20
21 import subprocess
22 import pandas as pd
23 import re
24 import os
25 import sys
26 import matplotlib as mpl
27 mpl.use('Agg')
28 import matplotlib.pyplot as plt
29
30 pList = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15']
31 quietString = "" #"">> Vap_log.txt 2>&1"
32 def transcriptMapping(inputname, strain, forwardFN,reverseFN):
33 #where is our Reference data -
34 dir_path = os.path.dirname(os.path.realpath(__file__))
35 refName = dir_path+"/data/Reference/Tc148" #default
36 if strain == "Tc148":
37 refName = dir_path+"/data/Reference/Tc148"
38 if strain == "IL3000":
39 refName = dir_path+"/data/Reference/IL3000"
40 #argString = "bowtie2 -x Refe4rence/IL3000 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt
41 #argString = "bowtie2 -x Reference/Tc148 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt
42 argString = "bowtie2 -x "+refName+" -1 "+forwardFN+" -2 "+reverseFN+" -S "+inputname+".sam"+quietString #>log.txt
43 #print(argString)
44 returncode = subprocess.call(argString, shell=True)
45
46 def processSamFiles(inputname):
47 #debug use a mapping sam file we have already found
48 #dir_path = os.path.dirname(os.path.realpath(__file__))
49 #bugName = dir_path+"/data/T_Test" #defasult
50
51 cur_path = os.getcwd()
52 samName = cur_path+"/"+inputname
53
54 #argString = "samtools view -bS "+bugName+" > "+inputname+".bam"
55 argString = "samtools view -bS "+inputname+".sam > "+samName+".bam"+quietString
56 #print(argString)
57 returncode = subprocess.call(argString, shell=True)
58
59
60 #argString = "samtools sort "+bugName+" -o "+inputname+".sorted"
61 argString = "samtools sort "+samName+".bam -o "+samName+".sorted"+quietString
62 #print("argstring = "+argString)
63 returncode = subprocess.call(argString, shell=True)
64
65 #argString = "samtools index "+bugName+".sorted "+inputname+".sorted.bai"
66 argString = "samtools index "+samName+".sorted "+samName+".sorted.bai"+quietString
67 #print("argstring = " + argString)
68 returncode = subprocess.call(argString, shell=True)
69
70
71
72
73 def transcriptAbundance(inputname, strain):
74 dir_path = os.path.dirname(os.path.realpath(__file__))
75 refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # defasult
76 if strain == "Tc148":
77 refName = dir_path + "/data/Reference/ORFAnnotation.gtf"
78 if strain == "IL3000":
79 refName = dir_path + "/data/Reference/IL3000.gtf"
80 #argString = "cufflinks -G Reference/IL3000.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"
81 #argString = "cufflinks -G Reference/ORFAnnotation.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"
82 argString = "cufflinks -q -G "+refName+" -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"+quietString
83 returncode = subprocess.call(argString, shell = True)
84
85
86 def convertToFasta(inputName, strain): #equivalent to Sara's awk scripte
87 dir_path = os.path.dirname(os.path.realpath(__file__))
88 refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # default
89 if strain == "Tc148":
90 refName = dir_path + "/data/Reference/148_prot.fasta"
91 if strain == "IL3000":
92 refName = dir_path + "data/Reference/IL3000_prot.fasta"
93
94 cuff_df = pd.read_csv(inputName+".cuff/genes.fpkm_tracking", sep='\t')
95 cuff_df = cuff_df[(cuff_df['FPKM'] > 0)]
96 cuff_df.to_csv("cuffTest.csv")
97 gene_id_List = cuff_df['gene_id'].tolist()
98
99 #print(gene_id_List)
100 #print ("Found from 8880="+str(found))
101
102 # need to load in IL3000_prot.fasta
103 # for each line with >TcIL3000_1_1940
104 # search within cuff_df[gene_id] for match
105 # add it to the outfile. (need to save it as used by hmmer later
106 number = 0
107 all = 0
108 with open(inputName+"_6frame.fas", 'w') as outfile:
109 ref = open(refName,'r')
110 #ref = open(r"Reference/IL3000_prot.fasta",'r')
111 n = 0
112 line = ref.readline()
113 while line:
114 if line[0] == '>':
115 all = all+1
116 ln = line[1:] #remove >
117 ln = ln.rstrip() #remove /n /r etc
118 #print (ln)
119 if ln in gene_id_List:
120 number = number+1
121 outfile.write(line)
122 line = ref.readline()
123 if line:
124 while line[0] != '>':
125 outfile.write(line)
126 line=ref.readline()
127 else:
128 line = ref.readline()
129 else:
130 line =ref.readline()
131 ref.close()
132 print(str(len(gene_id_List))+":"+str(number)+" from "+str(all))
133 return cuff_df
134
135 def HMMerMotifSearch(name, strain, cuff_df):
136 motifs = ['1', '2a', '2b', '3', '4a', '4b', '4c', '5', '6', '7', '8a', '8b', '9a', '9b',
137 '9c', '10a', '10b', '11a', '11b', '12', '13a', '13b', '13c', '13d', '14', '15a', '15b', '15c']
138 dir_path = os.path.dirname(os.path.realpath(__file__))
139 phylopath = dir_path + "/data/Motifs/Phylotype"
140 lineCounts = []
141 compoundList = []
142 for m in motifs:
143 argString = "hmmsearch "+phylopath + m + ".hmm " + name + "_6frame.fas > Phy" + m + ".out"
144 print(argString)
145 subprocess.call(argString, shell=True)
146 hmmResult = open("Phy" + m + ".out", 'r')
147 regex = r"Tc148[0-9]{1,8}"
148 if strain == "Tc148":
149 regex = r"Tc148[0-9]{1,8}"
150 if strain == "IL3000":
151 regex = r"TcIL3000_[0-9]{1,4}_[0-9]{1,5}"
152 n = 0
153 outList = []
154 for line in hmmResult:
155 m = re.search(regex, line)
156 if m:
157 outList.append(""+m.group())
158 n += 1
159 if re.search(r"inclusion", line):
160 print("inclusion threshold reached")
161 break
162 compoundList.append(outList)
163 lineCounts.append(n)
164 hmmResult.close()
165 #print(lineCounts)
166
167 #print(cuff_df)
168 concatGroups = [1, 2, 1, 3, 1, 1, 1, 2, 3, 2, 2, 1, 4, 1, 3]
169 countList = []
170 weightList = []
171 countIndex = 0
172 totalCount = 0
173 totalWeigth = 0
174 for c in concatGroups:
175 a = []
176 weight = []
177 for n in range(0, c):
178 a = a + compoundList.pop(0)
179 t = set(a)
180 countList.append(len(t))
181 wa = 0
182 for w in t:
183 wt = cuff_df.loc[cuff_df['gene_id'] == w, 'FPKM'].iloc[0]
184 #print(w)
185 #print(wt)
186 wa = wa+wt
187 weightList.append(wa)
188 totalWeigth+=wa
189 totalCount += len(t)
190 countList.append(totalCount)
191 weightList.append(totalWeigth)
192 #print(countList)
193 #print("--------")
194 #print(weightList)
195 #print("--------")
196 return countList,weightList
197
198 def relativeFrequencyTable(countList, name, htmlresource):
199 relFreqList = []
200 c = float(countList[15])
201 for i in range(0, 15):
202 relFreqList.append(countList[i] / c)
203
204 data = {'Phylotype': pList, 'Relative Frequency': relFreqList}
205 relFreq_df = pd.DataFrame(data)
206 j_fname = htmlresource+ "/" + name + "_t_relative_frequency.csv"
207 relFreq_df.to_csv(j_fname)
208 return relFreqList # 0-14 = p1-p15 counts [15] = total counts
209
210
211 def weightedFrequencyTable(countList, name, htmlresource):
212 relFreqList = []
213 c = float(countList[15])
214 for i in range(0, 15):
215 relFreqList.append(countList[i] / c)
216
217 data = {'Phylotype': pList, 'Weighted Frequency': relFreqList}
218 relFreq_df = pd.DataFrame(data)
219 j_fname = htmlresource+ "/" + name + "_t_weighted_frequency.csv"
220 relFreq_df.to_csv(j_fname)
221 return relFreqList # 0-14 = p1-p15 counts [15] = total counts
222
223
224
225 def createStackedBar(name,freqList,strain,pdf,html_resource):
226 palette = ["#0000ff", "#6495ed", "#00ffff", "#caff70",
227 "#228b22", "#528b8b", "#00ff00", "#a52a2a",
228 "#ff0000", "#ffff00", "#ffa500", "#ff1493",
229 "#9400d3", "#bebebe", "#000000", "#ff00ff"]
230
231 VAP_148 = [0.072, 0.032, 0.032, 0.004, 0.007,
232 0.005, 0.202, 0.004, 0.006, 0.014,
233 0.130, 0.133, 0.054, 0.039, 0.265]
234
235 VAP_IL3000 = [0.073, 0.040, 0.049, 0.018, 0.060,
236 0.055, 0.054, 0.025, 0.012, 0.060,
237 0.142, 0.100, 0.061, 0.078, 0.172]
238 cmap = plt.cm.get_cmap('tab20')
239 palette = [cmap(i) for i in range(cmap.N)]
240
241 if strain == "Tc148":
242 VAPtable = VAP_148
243 VAPname='Tc148\nGenome VAP'
244 if strain == "IL3000":
245 VAPtable = VAP_IL3000
246 VAPname= 'IL3000\nGenome VAP'
247 width = 0.35 # the width of the bars: can also be len(x) sequence
248 plots = []
249 fpos = 0
250 vpos = 0
251 for p in range(0, 15):
252 tp = plt.bar(0, freqList[p], width, color= palette[p], bottom = fpos)
253 fpos +=freqList[p]
254
255 tp = plt.bar(1, VAPtable[p], width, color= palette[p], bottom = vpos)
256 vpos +=VAPtable[p]
257
258 plots.append(tp)
259 plt.xticks([0,1],[name,VAPname])
260 plt.legend(plots[::-1],['p15','p14','p13','p12','p11','p10','p9','p8','p7','p6','p5','p4','p3','p2','p1'])
261 title = "Figure Legend: The transcriptomic Variant Antigen Profile of $\itTrypanosoma$ $\itcongolense$ estimated as phylotype " \
262 "proportion adjusted for transcript abundance and the reference genomic Variant Antigen Profile. " \
263 "\nData was produced with the 'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018)."
264 #plt.title(title, wrap="True")
265 #plt.text(-0.2, -0.05, title, va="top", transform=ax.transAxes, wrap="True")
266 plt.text(-0.3, -0.15, title, va="top", wrap="True")
267 plt.tight_layout(pad=1.5)
268 plt.subplots_adjust(bottom = 0.3,top=0.99,left=0.125,right=0.9,hspace=0.2,wspace=0.2)
269
270 plt.savefig(html_resource + "/stackedbar.png")
271 if pdf == 'PDF_Yes':
272 plt.savefig(html_resource + "/stackedbar.pdf")
273 #plt.show()
274
275
276 def createHTML(name,htmlfn,htmlresource,freqList,weightList):
277 #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource
278 htmlString = r"<html><title>T.congolense VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma congolense</i> Variant Antigen Profile</h2><h3>"
279 htmlString += name
280 htmlString += r"<br>Transcriptomic Analysis</h3></p>"
281 htmlString += "<p style = 'margin-left:20%; margin-right:20%'>Table Legend: Variant Antigen Profiles of a transcriptome of <i>Trypanosoma congolense</i> estimated as phylotype proportion. " \
282 "Weighted frequency refers to the phylotype proportion based transcript abundance. " \
283 "Data was produced with the 'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018).</p> "
284 htmlString += r"<style> table, th, tr, td {border: 1px solid black; border-collapse: collapse;}</style>"
285
286 htmlString += r"<table style='width:50%;margin-left:25%;text-align:center'><tr><th>Phylotype</th><th>Relative Frequency</th><th>Weighted Frequency</th></tr>"
287 tabString = ""
288 # flush out table with correct values
289 for i in range(0, 15):
290 f = format(freqList[i], '.4f')
291 w = format(weightList[i], '.4f')
292 tabString += "<tr><td>phy" + str(i + 1) + "</td><td>" + f + "</td><td>" + w + "</td></tr>"
293 htmlString += tabString + "</table><br><br><br><br><br>"
294 htmlString += r"<p> <h3>Stacked Bar chart of Phylotype Frequency</h3> The 'weighted' relative frequency of each phylotype alongside the VAP of selected strain.</p>"
295 imgString = r"<img src = 'stackedbar.png' alt='Stacked bar chart of phylotype variation' style='max-width:100%'><br><br>"
296 htmlString += imgString
297
298 # htmlString += r"<p><h3>The Deviation Heat Map and Dendogram</h3>The phylotype variation expressed as the deviation from your sample mean compared to the model dataset</p>"
299 # imgString = r"<img src = 'dheatmap.png' alt='Deviation Heatmap' style='max-width:100%'><br><br>"
300 # htmlString += imgString
301
302 # htmlString += r"<p><h3>The Variation PCA plot</h3>PCA analysis corresponding to absolute variation. Colour coded according to location</p>"
303 # imgString = r"<img src = 'vapPCA.png' alt='PCA Analysis' style='max-width:100%'><br><br>"
304 # htmlString += imgString + r"</div></body></html>"
305
306 with open(htmlfn, "w") as htmlfile:
307 htmlfile.write(htmlString)
308
309 #argdict = {'name':2, 'pdfexport': 3, 'strain': 4, 'forward': 5, 'reverse': 6, 'html_file': 7, 'html_resource': 8}
310 def transcriptomicProcess(args,dict):
311 transcriptMapping(args[dict['name']], args[dict['strain']], args[dict['forward']], args[dict['reverse']]) #uses bowtie
312 processSamFiles(args[dict['name']]) #uses samtools
313 transcriptAbundance(args[dict['name']],args[dict['strain']]) #uses cufflinks -> ?.cuff/*.*
314 cuff_df = convertToFasta(args[dict['name']],args[dict['strain']])
315 countList, weightList = HMMerMotifSearch(args[dict['name']],args[dict['strain']], cuff_df)
316 relFreqList = relativeFrequencyTable(countList,args[dict['name']],args[dict['html_resource']])
317 relWeightList = weightedFrequencyTable(weightList,args[dict['name']],args[dict['html_resource']])
318 createStackedBar(args[dict['name']],relWeightList, args[dict['strain']],args[dict['pdfexport']],args[dict['html_resource']])
319 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']], relFreqList, relWeightList)
320
321 if __name__ == "__main__":
322 #print("Commencing Transcript Mapping")
323 #transcriptMapping("T_Test", "Transcripts.1","Transcripts.2")
324 #print("Processimg Sam Files")
325 #processSamFiles("T_Test")
326 #print("Assessing Transcript Abundance")
327 #transcriptAbundance("T_Test")
328 #print ("Converting to Fasta Subset")
329 #cuff_df = convertToFasta("T_Test")
330 #print("Commencing HMMer search")
331 #countList, weightList = HMMerMotifSearch("T_Test",cuff_df)
332 #relativeFrequencyTable(countList,'T_Test')
333 #weightedFrequencyTable(weightList,'T_Test')
334 relFreqList = [0.111842105,0.059210526,0.026315789,0.013157895,
335 0.006578947,0.013157895,0.032894737,0.019736842,
336 0.039473684,0.046052632,0.217105263,0.065789474,
337 0.151315789,0.059210526,0.138157895]
338
339 relWeightList = [0.07532571,0.05900545,0.009601452,0.042357532,0.01236219,0.001675663,0.04109726,
340 0.097464248,0.057491666,0.05826875,0.279457473,0.070004772,0.065329007,0.085361298,0.045197529]
341
342 createStackedBar('T_Test',relWeightList, 'Tc148','PDF_Yes','results')
343 createHTML("t_test","results/t_test.html","results",relFreqList,relWeightList)