Mercurial > repos > johnheap > vapper
comparison Tryp_T.py @ 0:36cb22bd911d draft
planemo upload for repository https://github.com/johnheap/VAPPER-Galaxy
| author | johnheap |
|---|---|
| date | Wed, 04 Jul 2018 16:39:13 -0400 |
| parents | |
| children | 8f6469ffef85 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:36cb22bd911d |
|---|---|
| 1 """ | |
| 2 * Copyright 2018 University of Liverpool | |
| 3 * Author: John Heap, Computational Biology Facility, UoL | |
| 4 * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL | |
| 5 * | |
| 6 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 * you may not use this file except in compliance with the License. | |
| 8 * You may obtain a copy of the License at | |
| 9 * | |
| 10 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 * | |
| 12 * Unless required by applicable law or agreed to in writing, software | |
| 13 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 * See the License for the specific language governing permissions and | |
| 16 * limitations under the License. | |
| 17 * | |
| 18 """ | |
| 19 | |
| 20 | |
| 21 import subprocess | |
| 22 import pandas as pd | |
| 23 import re | |
| 24 import os | |
| 25 import sys | |
| 26 import matplotlib as mpl | |
| 27 mpl.use('Agg') | |
| 28 import matplotlib.pyplot as plt | |
| 29 | |
| 30 pList = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15'] | |
| 31 quietString = "" #"">> Vap_log.txt 2>&1" | |
| 32 def transcriptMapping(inputname, strain, forwardFN,reverseFN): | |
| 33 #where is our Reference data - | |
| 34 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 35 refName = dir_path+"/data/Reference/Tc148" #default | |
| 36 if strain == "Tc148": | |
| 37 refName = dir_path+"/data/Reference/Tc148" | |
| 38 if strain == "IL3000": | |
| 39 refName = dir_path+"/data/Reference/IL3000" | |
| 40 #argString = "bowtie2 -x Refe4rence/IL3000 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt | |
| 41 #argString = "bowtie2 -x Reference/Tc148 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt | |
| 42 argString = "bowtie2 -x "+refName+" -1 "+forwardFN+" -2 "+reverseFN+" -S "+inputname+".sam"+quietString #>log.txt | |
| 43 #print(argString) | |
| 44 returncode = subprocess.call(argString, shell=True) | |
| 45 | |
| 46 def processSamFiles(inputname): | |
| 47 #debug use a mapping sam file we have already found | |
| 48 #dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 49 #bugName = dir_path+"/data/T_Test" #defasult | |
| 50 | |
| 51 cur_path = os.getcwd() | |
| 52 samName = cur_path+"/"+inputname | |
| 53 | |
| 54 #argString = "samtools view -bS "+bugName+" > "+inputname+".bam" | |
| 55 argString = "samtools view -bS "+inputname+".sam > "+samName+".bam"+quietString | |
| 56 #print(argString) | |
| 57 returncode = subprocess.call(argString, shell=True) | |
| 58 | |
| 59 | |
| 60 #argString = "samtools sort "+bugName+" -o "+inputname+".sorted" | |
| 61 argString = "samtools sort "+samName+".bam -o "+samName+".sorted"+quietString | |
| 62 #print("argstring = "+argString) | |
| 63 returncode = subprocess.call(argString, shell=True) | |
| 64 | |
| 65 #argString = "samtools index "+bugName+".sorted "+inputname+".sorted.bai" | |
| 66 argString = "samtools index "+samName+".sorted "+samName+".sorted.bai"+quietString | |
| 67 #print("argstring = " + argString) | |
| 68 returncode = subprocess.call(argString, shell=True) | |
| 69 | |
| 70 | |
| 71 | |
| 72 | |
| 73 def transcriptAbundance(inputname, strain): | |
| 74 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 75 refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # defasult | |
| 76 if strain == "Tc148": | |
| 77 refName = dir_path + "/data/Reference/ORFAnnotation.gtf" | |
| 78 if strain == "IL3000": | |
| 79 refName = dir_path + "/data/Reference/IL3000.gtf" | |
| 80 #argString = "cufflinks -G Reference/IL3000.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted" | |
| 81 #argString = "cufflinks -G Reference/ORFAnnotation.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted" | |
| 82 argString = "cufflinks -q -G "+refName+" -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"+quietString | |
| 83 returncode = subprocess.call(argString, shell = True) | |
| 84 | |
| 85 | |
| 86 def convertToFasta(inputName, strain): #equivalent to Sara's awk scripte | |
| 87 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 88 refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # default | |
| 89 if strain == "Tc148": | |
| 90 refName = dir_path + "/data/Reference/148_prot.fasta" | |
| 91 if strain == "IL3000": | |
| 92 refName = dir_path + "data/Reference/IL3000_prot.fasta" | |
| 93 | |
| 94 cuff_df = pd.read_csv(inputName+".cuff/genes.fpkm_tracking", sep='\t') | |
| 95 cuff_df = cuff_df[(cuff_df['FPKM'] > 0)] | |
| 96 cuff_df.to_csv("cuffTest.csv") | |
| 97 gene_id_List = cuff_df['gene_id'].tolist() | |
| 98 | |
| 99 #print(gene_id_List) | |
| 100 #print ("Found from 8880="+str(found)) | |
| 101 | |
| 102 # need to load in IL3000_prot.fasta | |
| 103 # for each line with >TcIL3000_1_1940 | |
| 104 # search within cuff_df[gene_id] for match | |
| 105 # add it to the outfile. (need to save it as used by hmmer later | |
| 106 number = 0 | |
| 107 all = 0 | |
| 108 with open(inputName+"_6frame.fas", 'w') as outfile: | |
| 109 ref = open(refName,'r') | |
| 110 #ref = open(r"Reference/IL3000_prot.fasta",'r') | |
| 111 n = 0 | |
| 112 line = ref.readline() | |
| 113 while line: | |
| 114 if line[0] == '>': | |
| 115 all = all+1 | |
| 116 ln = line[1:] #remove > | |
| 117 ln = ln.rstrip() #remove /n /r etc | |
| 118 #print (ln) | |
| 119 if ln in gene_id_List: | |
| 120 number = number+1 | |
| 121 outfile.write(line) | |
| 122 line = ref.readline() | |
| 123 if line: | |
| 124 while line[0] != '>': | |
| 125 outfile.write(line) | |
| 126 line=ref.readline() | |
| 127 else: | |
| 128 line = ref.readline() | |
| 129 else: | |
| 130 line =ref.readline() | |
| 131 ref.close() | |
| 132 print(str(len(gene_id_List))+":"+str(number)+" from "+str(all)) | |
| 133 return cuff_df | |
| 134 | |
| 135 def HMMerMotifSearch(name, strain, cuff_df): | |
| 136 motifs = ['1', '2a', '2b', '3', '4a', '4b', '4c', '5', '6', '7', '8a', '8b', '9a', '9b', | |
| 137 '9c', '10a', '10b', '11a', '11b', '12', '13a', '13b', '13c', '13d', '14', '15a', '15b', '15c'] | |
| 138 dir_path = os.path.dirname(os.path.realpath(__file__)) | |
| 139 phylopath = dir_path + "/data/Motifs/Phylotype" | |
| 140 lineCounts = [] | |
| 141 compoundList = [] | |
| 142 for m in motifs: | |
| 143 argString = "hmmsearch "+phylopath + m + ".hmm " + name + "_6frame.fas > Phy" + m + ".out" | |
| 144 print(argString) | |
| 145 subprocess.call(argString, shell=True) | |
| 146 hmmResult = open("Phy" + m + ".out", 'r') | |
| 147 regex = r"Tc148[0-9]{1,8}" | |
| 148 if strain == "Tc148": | |
| 149 regex = r"Tc148[0-9]{1,8}" | |
| 150 if strain == "IL3000": | |
| 151 regex = r"TcIL3000_[0-9]{1,4}_[0-9]{1,5}" | |
| 152 n = 0 | |
| 153 outList = [] | |
| 154 for line in hmmResult: | |
| 155 m = re.search(regex, line) | |
| 156 if m: | |
| 157 outList.append(""+m.group()) | |
| 158 n += 1 | |
| 159 if re.search(r"inclusion", line): | |
| 160 print("inclusion threshold reached") | |
| 161 break | |
| 162 compoundList.append(outList) | |
| 163 lineCounts.append(n) | |
| 164 hmmResult.close() | |
| 165 #print(lineCounts) | |
| 166 | |
| 167 #print(cuff_df) | |
| 168 concatGroups = [1, 2, 1, 3, 1, 1, 1, 2, 3, 2, 2, 1, 4, 1, 3] | |
| 169 countList = [] | |
| 170 weightList = [] | |
| 171 countIndex = 0 | |
| 172 totalCount = 0 | |
| 173 totalWeigth = 0 | |
| 174 for c in concatGroups: | |
| 175 a = [] | |
| 176 weight = [] | |
| 177 for n in range(0, c): | |
| 178 a = a + compoundList.pop(0) | |
| 179 t = set(a) | |
| 180 countList.append(len(t)) | |
| 181 wa = 0 | |
| 182 for w in t: | |
| 183 wt = cuff_df.loc[cuff_df['gene_id'] == w, 'FPKM'].iloc[0] | |
| 184 #print(w) | |
| 185 #print(wt) | |
| 186 wa = wa+wt | |
| 187 weightList.append(wa) | |
| 188 totalWeigth+=wa | |
| 189 totalCount += len(t) | |
| 190 countList.append(totalCount) | |
| 191 weightList.append(totalWeigth) | |
| 192 #print(countList) | |
| 193 #print("--------") | |
| 194 #print(weightList) | |
| 195 #print("--------") | |
| 196 return countList,weightList | |
| 197 | |
| 198 def relativeFrequencyTable(countList, name, htmlresource): | |
| 199 relFreqList = [] | |
| 200 c = float(countList[15]) | |
| 201 for i in range(0, 15): | |
| 202 relFreqList.append(countList[i] / c) | |
| 203 | |
| 204 data = {'Phylotype': pList, 'Relative Frequency': relFreqList} | |
| 205 relFreq_df = pd.DataFrame(data) | |
| 206 j_fname = htmlresource+ "/" + name + "_t_relative_frequency.csv" | |
| 207 relFreq_df.to_csv(j_fname) | |
| 208 return relFreqList # 0-14 = p1-p15 counts [15] = total counts | |
| 209 | |
| 210 | |
| 211 def weightedFrequencyTable(countList, name, htmlresource): | |
| 212 relFreqList = [] | |
| 213 c = float(countList[15]) | |
| 214 for i in range(0, 15): | |
| 215 relFreqList.append(countList[i] / c) | |
| 216 | |
| 217 data = {'Phylotype': pList, 'Weighted Frequency': relFreqList} | |
| 218 relFreq_df = pd.DataFrame(data) | |
| 219 j_fname = htmlresource+ "/" + name + "_t_weighted_frequency.csv" | |
| 220 relFreq_df.to_csv(j_fname) | |
| 221 return relFreqList # 0-14 = p1-p15 counts [15] = total counts | |
| 222 | |
| 223 | |
| 224 | |
| 225 def createStackedBar(name,freqList,strain,pdf,html_resource): | |
| 226 palette = ["#0000ff", "#6495ed", "#00ffff", "#caff70", | |
| 227 "#228b22", "#528b8b", "#00ff00", "#a52a2a", | |
| 228 "#ff0000", "#ffff00", "#ffa500", "#ff1493", | |
| 229 "#9400d3", "#bebebe", "#000000", "#ff00ff"] | |
| 230 | |
| 231 VAP_148 = [0.072, 0.032, 0.032, 0.004, 0.007, | |
| 232 0.005, 0.202, 0.004, 0.006, 0.014, | |
| 233 0.130, 0.133, 0.054, 0.039, 0.265] | |
| 234 | |
| 235 VAP_IL3000 = [0.073, 0.040, 0.049, 0.018, 0.060, | |
| 236 0.055, 0.054, 0.025, 0.012, 0.060, | |
| 237 0.142, 0.100, 0.061, 0.078, 0.172] | |
| 238 cmap = plt.cm.get_cmap('tab20') | |
| 239 palette = [cmap(i) for i in range(cmap.N)] | |
| 240 | |
| 241 if strain == "Tc148": | |
| 242 VAPtable = VAP_148 | |
| 243 VAPname='Tc148\nGenome VAP' | |
| 244 if strain == "IL3000": | |
| 245 VAPtable = VAP_IL3000 | |
| 246 VAPname= 'IL3000\nGenome VAP' | |
| 247 width = 0.35 # the width of the bars: can also be len(x) sequence | |
| 248 plots = [] | |
| 249 fpos = 0 | |
| 250 vpos = 0 | |
| 251 for p in range(0, 15): | |
| 252 tp = plt.bar(0, freqList[p], width, color= palette[p], bottom = fpos) | |
| 253 fpos +=freqList[p] | |
| 254 | |
| 255 tp = plt.bar(1, VAPtable[p], width, color= palette[p], bottom = vpos) | |
| 256 vpos +=VAPtable[p] | |
| 257 | |
| 258 plots.append(tp) | |
| 259 plt.xticks([0,1],[name,VAPname]) | |
| 260 plt.legend(plots[::-1],['p15','p14','p13','p12','p11','p10','p9','p8','p7','p6','p5','p4','p3','p2','p1']) | |
| 261 title = "Figure Legend: The transcriptomic Variant Antigen Profile of $\itTrypanosoma$ $\itcongolense$ estimated as phylotype " \ | |
| 262 "proportion adjusted for transcript abundance and the reference genomic Variant Antigen Profile. " \ | |
| 263 "\nData was produced with the 'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018)." | |
| 264 #plt.title(title, wrap="True") | |
| 265 #plt.text(-0.2, -0.05, title, va="top", transform=ax.transAxes, wrap="True") | |
| 266 plt.text(-0.3, -0.15, title, va="top", wrap="True") | |
| 267 plt.tight_layout(pad=1.5) | |
| 268 plt.subplots_adjust(bottom = 0.3,top=0.99,left=0.125,right=0.9,hspace=0.2,wspace=0.2) | |
| 269 | |
| 270 plt.savefig(html_resource + "/stackedbar.png") | |
| 271 if pdf == 'PDF_Yes': | |
| 272 plt.savefig(html_resource + "/stackedbar.pdf") | |
| 273 #plt.show() | |
| 274 | |
| 275 | |
| 276 def createHTML(name,htmlfn,htmlresource,freqList,weightList): | |
| 277 #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource | |
| 278 htmlString = r"<html><title>T.congolense VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma congolense</i> Variant Antigen Profile</h2><h3>" | |
| 279 htmlString += name | |
| 280 htmlString += r"<br>Transcriptomic Analysis</h3></p>" | |
| 281 htmlString += "<p style = 'margin-left:20%; margin-right:20%'>Table Legend: Variant Antigen Profiles of a transcriptome of <i>Trypanosoma congolense</i> estimated as phylotype proportion. " \ | |
| 282 "Weighted frequency refers to the phylotype proportion based transcript abundance. " \ | |
| 283 "Data was produced with the 'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018).</p> " | |
| 284 htmlString += r"<style> table, th, tr, td {border: 1px solid black; border-collapse: collapse;}</style>" | |
| 285 | |
| 286 htmlString += r"<table style='width:50%;margin-left:25%;text-align:center'><tr><th>Phylotype</th><th>Relative Frequency</th><th>Weighted Frequency</th></tr>" | |
| 287 tabString = "" | |
| 288 # flush out table with correct values | |
| 289 for i in range(0, 15): | |
| 290 f = format(freqList[i], '.4f') | |
| 291 w = format(weightList[i], '.4f') | |
| 292 tabString += "<tr><td>phy" + str(i + 1) + "</td><td>" + f + "</td><td>" + w + "</td></tr>" | |
| 293 htmlString += tabString + "</table><br><br><br><br><br>" | |
| 294 htmlString += r"<p> <h3>Stacked Bar chart of Phylotype Frequency</h3> The 'weighted' relative frequency of each phylotype alongside the VAP of selected strain.</p>" | |
| 295 imgString = r"<img src = 'stackedbar.png' alt='Stacked bar chart of phylotype variation' style='max-width:100%'><br><br>" | |
| 296 htmlString += imgString | |
| 297 | |
| 298 # htmlString += r"<p><h3>The Deviation Heat Map and Dendogram</h3>The phylotype variation expressed as the deviation from your sample mean compared to the model dataset</p>" | |
| 299 # imgString = r"<img src = 'dheatmap.png' alt='Deviation Heatmap' style='max-width:100%'><br><br>" | |
| 300 # htmlString += imgString | |
| 301 | |
| 302 # htmlString += r"<p><h3>The Variation PCA plot</h3>PCA analysis corresponding to absolute variation. Colour coded according to location</p>" | |
| 303 # imgString = r"<img src = 'vapPCA.png' alt='PCA Analysis' style='max-width:100%'><br><br>" | |
| 304 # htmlString += imgString + r"</div></body></html>" | |
| 305 | |
| 306 with open(htmlfn, "w") as htmlfile: | |
| 307 htmlfile.write(htmlString) | |
| 308 | |
| 309 #argdict = {'name':2, 'pdfexport': 3, 'strain': 4, 'forward': 5, 'reverse': 6, 'html_file': 7, 'html_resource': 8} | |
| 310 def transcriptomicProcess(args,dict): | |
| 311 transcriptMapping(args[dict['name']], args[dict['strain']], args[dict['forward']], args[dict['reverse']]) #uses bowtie | |
| 312 processSamFiles(args[dict['name']]) #uses samtools | |
| 313 transcriptAbundance(args[dict['name']],args[dict['strain']]) #uses cufflinks -> ?.cuff/*.* | |
| 314 cuff_df = convertToFasta(args[dict['name']],args[dict['strain']]) | |
| 315 countList, weightList = HMMerMotifSearch(args[dict['name']],args[dict['strain']], cuff_df) | |
| 316 relFreqList = relativeFrequencyTable(countList,args[dict['name']],args[dict['html_resource']]) | |
| 317 relWeightList = weightedFrequencyTable(weightList,args[dict['name']],args[dict['html_resource']]) | |
| 318 createStackedBar(args[dict['name']],relWeightList, args[dict['strain']],args[dict['pdfexport']],args[dict['html_resource']]) | |
| 319 createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']], relFreqList, relWeightList) | |
| 320 | |
| 321 if __name__ == "__main__": | |
| 322 #print("Commencing Transcript Mapping") | |
| 323 #transcriptMapping("T_Test", "Transcripts.1","Transcripts.2") | |
| 324 #print("Processimg Sam Files") | |
| 325 #processSamFiles("T_Test") | |
| 326 #print("Assessing Transcript Abundance") | |
| 327 #transcriptAbundance("T_Test") | |
| 328 #print ("Converting to Fasta Subset") | |
| 329 #cuff_df = convertToFasta("T_Test") | |
| 330 #print("Commencing HMMer search") | |
| 331 #countList, weightList = HMMerMotifSearch("T_Test",cuff_df) | |
| 332 #relativeFrequencyTable(countList,'T_Test') | |
| 333 #weightedFrequencyTable(weightList,'T_Test') | |
| 334 relFreqList = [0.111842105,0.059210526,0.026315789,0.013157895, | |
| 335 0.006578947,0.013157895,0.032894737,0.019736842, | |
| 336 0.039473684,0.046052632,0.217105263,0.065789474, | |
| 337 0.151315789,0.059210526,0.138157895] | |
| 338 | |
| 339 relWeightList = [0.07532571,0.05900545,0.009601452,0.042357532,0.01236219,0.001675663,0.04109726, | |
| 340 0.097464248,0.057491666,0.05826875,0.279457473,0.070004772,0.065329007,0.085361298,0.045197529] | |
| 341 | |
| 342 createStackedBar('T_Test',relWeightList, 'Tc148','PDF_Yes','results') | |
| 343 createHTML("t_test","results/t_test.html","results",relFreqList,relWeightList) |
