# HG changeset patch # User johnheap # Date 1559585207 14400 # Node ID 25226d33642e28fb8e6330b76e85ffb3931f63af # Parent e91e413809461698bcd380067eba1afaceeb2554 Uploaded diff -r e91e41380946 -r 25226d33642e Tryp_T.py --- a/Tryp_T.py Mon Jun 03 14:05:47 2019 -0400 +++ b/Tryp_T.py Mon Jun 03 14:06:47 2019 -0400 @@ -1,345 +1,2261 @@ -""" - * Copyright 2018 University of Liverpool - * Author: John Heap, Computational Biology Facility, UoL - * Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - """ - - -import subprocess -import pandas as pd -import re -import os -import sys -import matplotlib as mpl -mpl.use('Agg') -import matplotlib.pyplot as plt - -pList = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15'] -quietString = "" #"">> Vap_log.txt 2>&1" -def transcriptMapping(inputname, strain, forwardFN,reverseFN): - #where is our Reference data - - dir_path = os.path.dirname(os.path.realpath(__file__)) - refName = dir_path+"/data/Reference/Tc148" #default - if strain == "Tc148": - refName = dir_path+"/data/Reference/Tc148" - if strain == "IL3000": - refName = dir_path+"/data/Reference/IL3000" - #argString = "bowtie2 -x Refe4rence/IL3000 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt - #argString = "bowtie2 -x Reference/Tc148 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt - argString = "bowtie2 -x "+refName+" -1 "+forwardFN+" -2 "+reverseFN+" -S "+inputname+".sam"+quietString #>log.txt - #print(argString) - returncode = subprocess.call(argString, shell=True) - -def processSamFiles(inputname): - #debug use a mapping sam file we have already found - #dir_path = os.path.dirname(os.path.realpath(__file__)) - #bugName = dir_path+"/data/T_Test" #defasult - - cur_path = os.getcwd() - samName = cur_path+"/"+inputname - - #argString = "samtools view -bS "+bugName+" > "+inputname+".bam" - argString = "samtools view -bS "+inputname+".sam > "+samName+".bam"+quietString - #print(argString) - returncode = subprocess.call(argString, shell=True) - - - #argString = "samtools sort "+bugName+" -o "+inputname+".sorted" - argString = "samtools sort "+samName+".bam -o "+samName+".sorted"+quietString - #print("argstring = "+argString) - returncode = subprocess.call(argString, shell=True) - - #argString = "samtools index "+bugName+".sorted "+inputname+".sorted.bai" - argString = "samtools index "+samName+".sorted "+samName+".sorted.bai"+quietString - #print("argstring = " + argString) - returncode = subprocess.call(argString, shell=True) - - - - -def transcriptAbundance(inputname, strain): - dir_path = os.path.dirname(os.path.realpath(__file__)) - refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # defasult - if strain == "Tc148": - refName = dir_path + "/data/Reference/ORFAnnotation.gtf" - if strain == "IL3000": - refName = dir_path + "/data/Reference/IL3000.gtf" - #argString = "cufflinks -G Reference/IL3000.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted" - #argString = "cufflinks -G Reference/ORFAnnotation.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted" - argString = "cufflinks -q -G "+refName+" -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"+quietString - returncode = subprocess.call(argString, shell = True) - - -def convertToFasta(inputName, strain): #equivalent to Sara's awk scripte - dir_path = os.path.dirname(os.path.realpath(__file__)) - refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # default - if strain == "Tc148": - refName = dir_path + "/data/Reference/148_prot.fasta" - if strain == "IL3000": - refName = dir_path + "/data/Reference/IL3000_prot.fasta" - - cuff_df = pd.read_csv(inputName+".cuff/genes.fpkm_tracking", sep='\t') - cuff_df = cuff_df[(cuff_df['FPKM'] > 0)] - cuff_df.to_csv("cuffTest.csv") - gene_id_List = cuff_df['gene_id'].tolist() - - #print(gene_id_List) - #print ("Found from 8880="+str(found)) - - # need to load in IL3000_prot.fasta - # for each line with >TcIL3000_1_1940 - # search within cuff_df[gene_id] for match - # add it to the outfile. (need to save it as used by hmmer later - number = 0 - all = 0 - with open(inputName+"_6frame.fas", 'w') as outfile: - ref = open(refName,'r') - #ref = open(r"Reference/IL3000_prot.fasta",'r') - n = 0 - line = ref.readline() - while line: - if line[0] == '>': - all = all+1 - ln = line[1:] #remove > - ln = ln.rstrip() #remove /n /r etc - #print (ln) - if ln in gene_id_List: - number = number+1 - outfile.write(line) - line = ref.readline() - if line: - while line[0] != '>': - outfile.write(line) - line=ref.readline() - if not line: - break; - else: - line = ref.readline() - else: - line =ref.readline() - ref.close() - print(str(len(gene_id_List))+":"+str(number)+" from "+str(all)) - return cuff_df - -def HMMerMotifSearch(name, strain, cuff_df): - motifs = ['1', '2a', '2b', '3', '4a', '4b', '4c', '5', '6', '7', '8a', '8b', '9a', '9b', - '9c', '10a', '10b', '11a', '11b', '12', '13a', '13b', '13c', '13d', '14', '15a', '15b', '15c'] - dir_path = os.path.dirname(os.path.realpath(__file__)) - phylopath = dir_path + "/data/Motifs/Phylotype" - lineCounts = [] - compoundList = [] - for m in motifs: - argString = "hmmsearch "+phylopath + m + ".hmm " + name + "_6frame.fas > Phy" + m + ".out" - print(argString) - subprocess.call(argString, shell=True) - hmmResult = open("Phy" + m + ".out", 'r') - regex = r"Tc148[0-9]{1,8}" - if strain == "Tc148": - regex = r"Tc148[0-9]{1,8}" - if strain == "IL3000": - regex = r"TcIL3000_[0-9]{1,4}_[0-9]{1,5}" - n = 0 - outList = [] - for line in hmmResult: - m = re.search(regex, line) - if m: - outList.append(""+m.group()) - n += 1 - if re.search(r"inclusion", line): - print("inclusion threshold reached") - break - compoundList.append(outList) - lineCounts.append(n) - hmmResult.close() - #print(lineCounts) - - #print(cuff_df) - concatGroups = [1, 2, 1, 3, 1, 1, 1, 2, 3, 2, 2, 1, 4, 1, 3] - countList = [] - weightList = [] - countIndex = 0 - totalCount = 0 - totalWeigth = 0 - for c in concatGroups: - a = [] - weight = [] - for n in range(0, c): - a = a + compoundList.pop(0) - t = set(a) - countList.append(len(t)) - wa = 0 - for w in t: - wt = cuff_df.loc[cuff_df['gene_id'] == w, 'FPKM'].iloc[0] - #print(w) - #print(wt) - wa = wa+wt - weightList.append(wa) - totalWeigth+=wa - totalCount += len(t) - countList.append(totalCount) - weightList.append(totalWeigth) - #print(countList) - #print("--------") - #print(weightList) - #print("--------") - return countList,weightList - -def relativeFrequencyTable(countList, name, htmlresource): - relFreqList = [] - c = float(countList[15]) - for i in range(0, 15): - relFreqList.append(countList[i] / c) - - data = {'Phylotype': pList, 'Relative Frequency': relFreqList} - relFreq_df = pd.DataFrame(data) - j_fname = htmlresource+ "/" + name + "_t_relative_frequency.csv" - relFreq_df.to_csv(j_fname) - return relFreqList # 0-14 = p1-p15 counts [15] = total counts - - -def weightedFrequencyTable(countList, name, htmlresource): - relFreqList = [] - c = float(countList[15]) - for i in range(0, 15): - relFreqList.append(countList[i] / c) - - data = {'Phylotype': pList, 'Weighted Frequency': relFreqList} - relFreq_df = pd.DataFrame(data) - j_fname = htmlresource+ "/" + name + "_t_weighted_frequency.csv" - relFreq_df.to_csv(j_fname) - return relFreqList # 0-14 = p1-p15 counts [15] = total counts - - - -def createStackedBar(name,freqList,strain,pdf,html_resource): - palette = ["#0000ff", "#6495ed", "#00ffff", "#caff70", - "#228b22", "#528b8b", "#00ff00", "#a52a2a", - "#ff0000", "#ffff00", "#ffa500", "#ff1493", - "#9400d3", "#bebebe", "#000000", "#ff00ff"] - - VAP_148 = [0.072, 0.032, 0.032, 0.004, 0.007, - 0.005, 0.202, 0.004, 0.006, 0.014, - 0.130, 0.133, 0.054, 0.039, 0.265] - - VAP_IL3000 = [0.073, 0.040, 0.049, 0.018, 0.060, - 0.055, 0.054, 0.025, 0.012, 0.060, - 0.142, 0.100, 0.061, 0.078, 0.172] - cmap = plt.cm.get_cmap('tab20') - palette = [cmap(i) for i in range(cmap.N)] - - if strain == "Tc148": - VAPtable = VAP_148 - VAPname='Tc148\nGenome VAP' - if strain == "IL3000": - VAPtable = VAP_IL3000 - VAPname= 'IL3000\nGenome VAP' - width = 0.35 # the width of the bars: can also be len(x) sequence - plots = [] - fpos = 0 - vpos = 0 - for p in range(0, 15): - tp = plt.bar(0, freqList[p], width, color= palette[p], bottom = fpos) - fpos +=freqList[p] - - tp = plt.bar(1, VAPtable[p], width, color= palette[p], bottom = vpos) - vpos +=VAPtable[p] - - plots.append(tp) - plt.xticks([0,1],[name,VAPname]) - plt.legend(plots[::-1],['p15','p14','p13','p12','p11','p10','p9','p8','p7','p6','p5','p4','p3','p2','p1']) - title = "Figure Legend: The transcriptomic Variant Antigen Profile of $\itTrypanosoma$ $\itcongolense$ estimated as phylotype " \ - "proportion adjusted for transcript abundance and the reference genomic Variant Antigen Profile. " \ - "\nData was produced with the 'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018)." - #plt.title(title, wrap="True") - #plt.text(-0.2, -0.05, title, va="top", transform=ax.transAxes, wrap="True") - plt.text(-0.3, -0.15, title, va="top", wrap="True") - plt.tight_layout(pad=1.5) - plt.subplots_adjust(bottom = 0.3,top=0.99,left=0.125,right=0.9,hspace=0.2,wspace=0.2) - - plt.savefig(html_resource + "/stackedbar.png") - if pdf == 'PDF_Yes': - plt.savefig(html_resource + "/stackedbar.pdf") - #plt.show() - - -def createHTML(name,htmlfn,htmlresource,freqList,weightList): - #assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource - htmlString = r"T.congolense VAP

Trypanosoma congolense Variant Antigen Profile

" - htmlString += name - htmlString += r"
Transcriptomic Analysis

" - htmlString += "

Table Legend: Variant Antigen Profiles of a transcriptome of Trypanosoma congolense estimated as phylotype proportion. " \ - "Weighted frequency refers to the phylotype proportion based transcript abundance. " \ - "Data was produced with the 'Variant Antigen Profiler' (Silva Pereira and Jackson, 2018).

" - htmlString += r"" - - htmlString += r"" - tabString = "" - # flush out table with correct values - for i in range(0, 15): - f = format(freqList[i], '.4f') - w = format(weightList[i], '.4f') - tabString += "" - htmlString += tabString + "
PhylotypeRelative FrequencyWeighted Frequency
phy" + str(i + 1) + "" + f + "" + w + "





" - htmlString += r"

Stacked Bar chart of Phylotype Frequency

The 'weighted' relative frequency of each phylotype alongside the VAP of selected strain.

" - imgString = r"Stacked bar chart of phylotype variation

" - htmlString += imgString - -# htmlString += r"

The Deviation Heat Map and Dendogram

The phylotype variation expressed as the deviation from your sample mean compared to the model dataset

" -# imgString = r"Deviation Heatmap

" -# htmlString += imgString - -# htmlString += r"

The Variation PCA plot

PCA analysis corresponding to absolute variation. Colour coded according to location

" -# imgString = r"PCA Analysis

" -# htmlString += imgString + r"
" - - with open(htmlfn, "w") as htmlfile: - htmlfile.write(htmlString) - -#argdict = {'name':2, 'pdfexport': 3, 'strain': 4, 'forward': 5, 'reverse': 6, 'html_file': 7, 'html_resource': 8} -def transcriptomicProcess(args,dict): - transcriptMapping(args[dict['name']], args[dict['strain']], args[dict['forward']], args[dict['reverse']]) #uses bowtie - processSamFiles(args[dict['name']]) #uses samtools - transcriptAbundance(args[dict['name']],args[dict['strain']]) #uses cufflinks -> ?.cuff/*.* - cuff_df = convertToFasta(args[dict['name']],args[dict['strain']]) - countList, weightList = HMMerMotifSearch(args[dict['name']],args[dict['strain']], cuff_df) - relFreqList = relativeFrequencyTable(countList,args[dict['name']],args[dict['html_resource']]) - relWeightList = weightedFrequencyTable(weightList,args[dict['name']],args[dict['html_resource']]) - createStackedBar(args[dict['name']],relWeightList, args[dict['strain']],args[dict['pdfexport']],args[dict['html_resource']]) - createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']], relFreqList, relWeightList) - -if __name__ == "__main__": - #print("Commencing Transcript Mapping") - #transcriptMapping("T_Test", "Transcripts.1","Transcripts.2") - #print("Processimg Sam Files") - #processSamFiles("T_Test") - #print("Assessing Transcript Abundance") - #transcriptAbundance("T_Test") - #print ("Converting to Fasta Subset") - #cuff_df = convertToFasta("T_Test") - #print("Commencing HMMer search") - #countList, weightList = HMMerMotifSearch("T_Test",cuff_df) - #relativeFrequencyTable(countList,'T_Test') - #weightedFrequencyTable(weightList,'T_Test') - relFreqList = [0.111842105,0.059210526,0.026315789,0.013157895, - 0.006578947,0.013157895,0.032894737,0.019736842, - 0.039473684,0.046052632,0.217105263,0.065789474, - 0.151315789,0.059210526,0.138157895] - - relWeightList = [0.07532571,0.05900545,0.009601452,0.042357532,0.01236219,0.001675663,0.04109726, - 0.097464248,0.057491666,0.05826875,0.279457473,0.070004772,0.065329007,0.085361298,0.045197529] - - createStackedBar('T_Test',relWeightList, 'Tc148','PDF_Yes','results') - createHTML("t_test","results/t_test.html","results",relFreqList,relWeightList) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + VAPPER-Galaxy/Tryp_T.py at master · johnheap/VAPPER-Galaxy · GitHub + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Skip to content +
+ + + + + + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + + + + + + + +
+
+ + + + + + + + Permalink + + + + + +
+ + +
+ + Branch: + master + + + + + + + +
+ +
+ + Find file + + + Copy path + +
+
+ + +
+ + Find file + + + Copy path + +
+
+ + + + +
+ Fetching contributors… +
+ +
+ + Cannot retrieve contributors at this time +
+
+ + + + + +
+ +
+ +
+ 346 lines (301 sloc) + + 15 KB +
+ +
+ +
+ Raw + Blame + History +
+ + +
+ + + +
+
+
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
"""
* Copyright 2018 University of Liverpool
* Author: John Heap, Computational Biology Facility, UoL
* Based on original scripts of Sara Silva Pereira, Institute of Infection and Global Health, UoL
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
"""
+
+
import subprocess
import pandas as pd
import re
import os
import sys
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
+
pList = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15']
quietString = "" #"">> Vap_log.txt 2>&1"
def transcriptMapping(inputname, strain, forwardFN,reverseFN):
#where is our Reference data -
dir_path = os.path.dirname(os.path.realpath(__file__))
refName = dir_path+"/data/Reference/Tc148" #default
if strain == "Tc148":
refName = dir_path+"/data/Reference/Tc148"
if strain == "IL3000":
refName = dir_path+"/data/Reference/IL3000"
#argString = "bowtie2 -x Refe4rence/IL3000 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt
#argString = "bowtie2 -x Reference/Tc148 -1 data/"+forwardFN+" -2 data/"+reverseFN+" -S "+inputname+".sam" #>log.txt
argString = "bowtie2 -x "+refName+" -1 "+forwardFN+" -2 "+reverseFN+" -S "+inputname+".sam"+quietString #>log.txt
#print(argString)
returncode = subprocess.call(argString, shell=True)
+
def processSamFiles(inputname):
#debug use a mapping sam file we have already found
#dir_path = os.path.dirname(os.path.realpath(__file__))
#bugName = dir_path+"/data/T_Test" #defasult
+
cur_path = os.getcwd()
samName = cur_path+"/"+inputname
+
#argString = "samtools view -bS "+bugName+" > "+inputname+".bam"
argString = "samtools view -bS "+inputname+".sam > "+samName+".bam"+quietString
#print(argString)
returncode = subprocess.call(argString, shell=True)
+
+
#argString = "samtools sort "+bugName+" -o "+inputname+".sorted"
argString = "samtools sort "+samName+".bam -o "+samName+".sorted"+quietString
#print("argstring = "+argString)
returncode = subprocess.call(argString, shell=True)
+
#argString = "samtools index "+bugName+".sorted "+inputname+".sorted.bai"
argString = "samtools index "+samName+".sorted "+samName+".sorted.bai"+quietString
#print("argstring = " + argString)
returncode = subprocess.call(argString, shell=True)
+
+
+
+
def transcriptAbundance(inputname, strain):
dir_path = os.path.dirname(os.path.realpath(__file__))
refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # defasult
if strain == "Tc148":
refName = dir_path + "/data/Reference/ORFAnnotation.gtf"
if strain == "IL3000":
refName = dir_path + "/data/Reference/IL3000.gtf"
#argString = "cufflinks -G Reference/IL3000.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"
#argString = "cufflinks -G Reference/ORFAnnotation.gtf -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"
argString = "cufflinks -q -G "+refName+" -o "+inputname+".cuff -u -p 8 "+inputname+".sorted"+quietString
returncode = subprocess.call(argString, shell = True)
+
+
def convertToFasta(inputName, strain): #equivalent to Sara's awk scripte
dir_path = os.path.dirname(os.path.realpath(__file__))
refName = dir_path + "/data/Reference/ORFAnnotation.gtf" # default
if strain == "Tc148":
refName = dir_path + "/data/Reference/148_prot.fasta"
if strain == "IL3000":
refName = dir_path + "/data/Reference/IL3000_prot.fasta"
+
cuff_df = pd.read_csv(inputName+".cuff/genes.fpkm_tracking", sep='\t')
cuff_df = cuff_df[(cuff_df['FPKM'] > 0)]
cuff_df.to_csv("cuffTest.csv")
gene_id_List = cuff_df['gene_id'].tolist()
+
#print(gene_id_List)
#print ("Found from 8880="+str(found))
+
# need to load in IL3000_prot.fasta
# for each line with >TcIL3000_1_1940
# search within cuff_df[gene_id] for match
# add it to the outfile. (need to save it as used by hmmer later
number = 0
all = 0
with open(inputName+"_6frame.fas", 'w') as outfile:
ref = open(refName,'r')
#ref = open(r"Reference/IL3000_prot.fasta",'r')
n = 0
line = ref.readline()
while line:
if line[0] == '>':
all = all+1
ln = line[1:] #remove >
ln = ln.rstrip() #remove /n /r etc
#print (ln)
if ln in gene_id_List:
number = number+1
outfile.write(line)
line = ref.readline()
if line:
while line[0] != '>':
outfile.write(line)
line=ref.readline()
if not line:
break;
else:
line = ref.readline()
else:
line =ref.readline()
ref.close()
print(str(len(gene_id_List))+":"+str(number)+" from "+str(all))
return cuff_df
+
def HMMerMotifSearch(name, strain, cuff_df):
motifs = ['1', '2a', '2b', '3', '4a', '4b', '4c', '5', '6', '7', '8a', '8b', '9a', '9b',
'9c', '10a', '10b', '11a', '11b', '12', '13a', '13b', '13c', '13d', '14', '15a', '15b', '15c']
dir_path = os.path.dirname(os.path.realpath(__file__))
phylopath = dir_path + "/data/Motifs/Phylotype"
lineCounts = []
compoundList = []
for m in motifs:
argString = "hmmsearch "+phylopath + m + ".hmm " + name + "_6frame.fas > Phy" + m + ".out"
print(argString)
subprocess.call(argString, shell=True)
hmmResult = open("Phy" + m + ".out", 'r')
regex = r"Tc148[0-9]{1,8}"
if strain == "Tc148":
regex = r"Tc148[0-9]{1,8}"
if strain == "IL3000":
regex = r"TcIL3000_[0-9]{1,4}_[0-9]{1,5}"
n = 0
outList = []
for line in hmmResult:
m = re.search(regex, line)
if m:
outList.append(""+m.group())
n += 1
if re.search(r"inclusion", line):
print("inclusion threshold reached")
break
compoundList.append(outList)
lineCounts.append(n)
hmmResult.close()
#print(lineCounts)
+
#print(cuff_df)
concatGroups = [1, 2, 1, 3, 1, 1, 1, 2, 3, 2, 2, 1, 4, 1, 3]
countList = []
weightList = []
countIndex = 0
totalCount = 0
totalWeigth = 0
for c in concatGroups:
a = []
weight = []
for n in range(0, c):
a = a + compoundList.pop(0)
t = set(a)
countList.append(len(t))
wa = 0
for w in t:
wt = cuff_df.loc[cuff_df['gene_id'] == w, 'FPKM'].iloc[0]
#print(w)
#print(wt)
wa = wa+wt
weightList.append(wa)
totalWeigth+=wa
totalCount += len(t)
countList.append(totalCount)
weightList.append(totalWeigth)
#print(countList)
#print("--------")
#print(weightList)
#print("--------")
return countList,weightList
+
def relativeFrequencyTable(countList, name, htmlresource):
relFreqList = []
c = float(countList[15])
for i in range(0, 15):
relFreqList.append(countList[i] / c)
+
data = {'Phylotype': pList, 'Relative Frequency': relFreqList}
relFreq_df = pd.DataFrame(data)
j_fname = htmlresource+ "/" + name + "_t_relative_frequency.csv"
relFreq_df.to_csv(j_fname)
return relFreqList # 0-14 = p1-p15 counts [15] = total counts
+
+
def weightedFrequencyTable(countList, name, htmlresource):
relFreqList = []
c = float(countList[15])
for i in range(0, 15):
relFreqList.append(countList[i] / c)
+
data = {'Phylotype': pList, 'Weighted Frequency': relFreqList}
relFreq_df = pd.DataFrame(data)
j_fname = htmlresource+ "/" + name + "_t_weighted_frequency.csv"
relFreq_df.to_csv(j_fname)
return relFreqList # 0-14 = p1-p15 counts [15] = total counts
+
+
+
def createStackedBar(name,freqList,strain,pdf,html_resource):
palette = ["#0000ff", "#6495ed", "#00ffff", "#caff70",
"#228b22", "#528b8b", "#00ff00", "#a52a2a",
"#ff0000", "#ffff00", "#ffa500", "#ff1493",
"#9400d3", "#bebebe", "#000000", "#ff00ff"]
+
VAP_148 = [0.072, 0.032, 0.032, 0.004, 0.007,
0.005, 0.202, 0.004, 0.006, 0.014,
0.130, 0.133, 0.054, 0.039, 0.265]
+
VAP_IL3000 = [0.073, 0.040, 0.049, 0.018, 0.060,
0.055, 0.054, 0.025, 0.012, 0.060,
0.142, 0.100, 0.061, 0.078, 0.172]
cmap = plt.cm.get_cmap('tab20')
palette = [cmap(i) for i in range(cmap.N)]
+
if strain == "Tc148":
VAPtable = VAP_148
VAPname='Tc148\nGenome VAP'
if strain == "IL3000":
VAPtable = VAP_IL3000
VAPname= 'IL3000\nGenome VAP'
width = 0.35 # the width of the bars: can also be len(x) sequence
plots = []
fpos = 0
vpos = 0
for p in range(0, 15):
tp = plt.bar(0, freqList[p], width, color= palette[p], bottom = fpos)
fpos +=freqList[p]
+
tp = plt.bar(1, VAPtable[p], width, color= palette[p], bottom = vpos)
vpos +=VAPtable[p]
+
plots.append(tp)
plt.xticks([0,1],[name,VAPname])
plt.legend(plots[::-1],['p15','p14','p13','p12','p11','p10','p9','p8','p7','p6','p5','p4','p3','p2','p1'])
title = "Figure Legend: The transcriptomic Variant Antigen Profile of $\itTrypanosoma$ $\itcongolense$ estimated as phylotype " \
"proportion adjusted for transcript abundance and the reference genomic Variant Antigen Profile. " \
"\nData was produced with the 'Variant Antigen Profiler' (Silva Pereira et al., 2019)."
#plt.title(title, wrap="True")
#plt.text(-0.2, -0.05, title, va="top", transform=ax.transAxes, wrap="True")
plt.text(-0.3, -0.15, title, va="top", wrap="True")
plt.tight_layout(pad=1.5)
plt.subplots_adjust(bottom = 0.3,top=0.99,left=0.125,right=0.9,hspace=0.2,wspace=0.2)
+
plt.savefig(html_resource + "/stackedbar.png")
if pdf == 'PDF_Yes':
plt.savefig(html_resource + "/stackedbar.pdf")
#plt.show()
+
+
def createHTML(name,htmlfn,htmlresource,freqList,weightList):
#assumes imgs are heatmap.png, dheatmap.png, vapPCA.png and already in htmlresource
htmlString = r"<html><title>T.congolense VAP</title><body><div style='text-align:center'><h2><i>Trypanosoma congolense</i> Variant Antigen Profile</h2><h3>"
htmlString += name
htmlString += r"<br>Transcriptomic Analysis</h3></p>"
htmlString += "<p style = 'margin-left:20%; margin-right:20%'>Table Legend: Variant Antigen Profiles of a transcriptome of <i>Trypanosoma congolense</i> estimated as phylotype proportion. " \
"Weighted frequency refers to the phylotype proportion based transcript abundance. " \
"Data was produced with the 'Variant Antigen Profiler' (Silva Pereira et al., 2019).</p> "
htmlString += r"<style> table, th, tr, td {border: 1px solid black; border-collapse: collapse;}</style>"
+
htmlString += r"<table style='width:50%;margin-left:25%;text-align:center'><tr><th>Phylotype</th><th>Relative Frequency</th><th>Weighted Frequency</th></tr>"
tabString = ""
# flush out table with correct values
for i in range(0, 15):
f = format(freqList[i], '.4f')
w = format(weightList[i], '.4f')
tabString += "<tr><td>phy" + str(i + 1) + "</td><td>" + f + "</td><td>" + w + "</td></tr>"
htmlString += tabString + "</table><br><br><br><br><br>"
htmlString += r"<p> <h3>Stacked Bar chart of Phylotype Frequency</h3> The 'weighted' relative frequency of each phylotype alongside the VAP of selected strain.</p>"
imgString = r"<img src = 'stackedbar.png' alt='Stacked bar chart of phylotype variation' style='max-width:100%'><br><br>"
htmlString += imgString
+
# htmlString += r"<p><h3>The Deviation Heat Map and Dendogram</h3>The phylotype variation expressed as the deviation from your sample mean compared to the model dataset</p>"
# imgString = r"<img src = 'dheatmap.png' alt='Deviation Heatmap' style='max-width:100%'><br><br>"
# htmlString += imgString
+
# htmlString += r"<p><h3>The Variation PCA plot</h3>PCA analysis corresponding to absolute variation. Colour coded according to location</p>"
# imgString = r"<img src = 'vapPCA.png' alt='PCA Analysis' style='max-width:100%'><br><br>"
# htmlString += imgString + r"</div></body></html>"
+
with open(htmlfn, "w") as htmlfile:
htmlfile.write(htmlString)
+
#argdict = {'name':2, 'pdfexport': 3, 'strain': 4, 'forward': 5, 'reverse': 6, 'html_file': 7, 'html_resource': 8}
def transcriptomicProcess(args,dict):
transcriptMapping(args[dict['name']], args[dict['strain']], args[dict['forward']], args[dict['reverse']]) #uses bowtie
processSamFiles(args[dict['name']]) #uses samtools
transcriptAbundance(args[dict['name']],args[dict['strain']]) #uses cufflinks -> ?.cuff/*.*
cuff_df = convertToFasta(args[dict['name']],args[dict['strain']])
countList, weightList = HMMerMotifSearch(args[dict['name']],args[dict['strain']], cuff_df)
relFreqList = relativeFrequencyTable(countList,args[dict['name']],args[dict['html_resource']])
relWeightList = weightedFrequencyTable(weightList,args[dict['name']],args[dict['html_resource']])
createStackedBar(args[dict['name']],relWeightList, args[dict['strain']],args[dict['pdfexport']],args[dict['html_resource']])
createHTML(args[dict['name']],args[dict['html_file']],args[dict['html_resource']], relFreqList, relWeightList)
+
if __name__ == "__main__":
#print("Commencing Transcript Mapping")
#transcriptMapping("T_Test", "Transcripts.1","Transcripts.2")
#print("Processimg Sam Files")
#processSamFiles("T_Test")
#print("Assessing Transcript Abundance")
#transcriptAbundance("T_Test")
#print ("Converting to Fasta Subset")
#cuff_df = convertToFasta("T_Test")
#print("Commencing HMMer search")
#countList, weightList = HMMerMotifSearch("T_Test",cuff_df)
#relativeFrequencyTable(countList,'T_Test')
#weightedFrequencyTable(weightList,'T_Test')
relFreqList = [0.111842105,0.059210526,0.026315789,0.013157895,
0.006578947,0.013157895,0.032894737,0.019736842,
0.039473684,0.046052632,0.217105263,0.065789474,
0.151315789,0.059210526,0.138157895]
+
relWeightList = [0.07532571,0.05900545,0.009601452,0.042357532,0.01236219,0.001675663,0.04109726,
0.097464248,0.057491666,0.05826875,0.279457473,0.070004772,0.065329007,0.085361298,0.045197529]
+
createStackedBar('T_Test',relWeightList, 'Tc148','PDF_Yes','results')
createHTML("t_test","results/t_test.html","results",relFreqList,relWeightList)
+ + + +
+ +
+ + + +
+ + +
+ + +
+
+ + + +
+ +
+ +
+
+ + +
+ + + + + + +
+ + + You can’t perform that action at this time. +
+ + + + + + + + + + + + + + +
+ + + +