changeset 37:28f1633ede7b draft

Uploaded
author tyty
date Mon, 20 Oct 2014 14:56:56 -0400
parents acb522d3fb0d
children b35dc7b728e5
files predict/.DS_Store predict/._.DS_Store predict/parse_dis_pac.py predict/predict_RNAs.py predict/predict_RNAs.xml predict/rRNA.txt predict/read_file.py predict/read_file.pyc predict/rtts_plot.py predict/rtts_plot.pyc
diffstat 10 files changed, 280 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file predict/.DS_Store has changed
Binary file predict/._.DS_Store has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/predict/parse_dis_pac.py	Mon Oct 20 14:56:56 2014 -0400
@@ -0,0 +1,43 @@
+#parse reactivity file into a dictionary
+
+import sys
+
+def parse_dist(in_file):
+    result = []
+    distribution = {}
+    name = []
+    f = open(in_file)
+    for aline in f.readlines():
+        line = aline.strip()
+        dis = line.strip()
+        dist = dis.split('\t') #split the line and the reactivites or reads are in a list
+        if len(dist) > 0:
+            if len(dist) == 1:
+                if dist[0].strip().find('coverage')==-1:
+                    name.append(line) #add the name in the name list
+                    flag = 1
+                    t_name = line
+            else:
+                distri = []
+                for i in range(0, len(dist)):
+                    distri.append(dist[i].strip())
+                distribution[t_name] = distri #add the list of reactivities into a dictionary
+    result.append(name)
+    result.append(distribution) #Output the dictionary
+    f.close()
+    return result
+                
+                
+
+
+
+
+
+
+
+        
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/predict/predict_RNAs.py	Mon Oct 20 14:56:56 2014 -0400
@@ -0,0 +1,93 @@
+#RNA structure prediction & Output and illustrate reactivities
+
+import sys
+from parse_dis_pac import *
+from read_file import *
+from Bio import SeqIO
+import os
+from rtts_plot import *
+
+
+id_file = sys.argv[1]
+seq_file = sys.argv[2]
+output_file = sys.argv[4]
+
+
+flag = 0
+if sys.argv[3]!='None': #input reactivity file if provided
+    react_file = sys.argv[3]
+    react = parse_dist(react_file)
+    react = react[1]
+    flag = 1
+
+ospath = os.path.realpath(sys.argv[0])
+ost = ospath.split('/')
+syspath = ""
+for i in range(len(ost)-1):
+    syspath = syspath+ost[i].strip()
+    syspath = syspath+'/' 
+
+ids = read_t_file(id_file)
+sequences = SeqIO.parse(seq_file, 'fasta')
+
+
+seqs = {}
+for seq in sequences:
+    seqs[seq.id] = seq.seq.tostring()
+
+if len(ids)>10: #setup a limit of the number of sequence to be predicted
+    print("Number of sequences exceeds limitation!")
+    sys.exit(0)
+    
+
+#predict RNA structures
+os.system("mkdir "+syspath+"output_f")
+for i in range(len(ids)):
+    id_s = ids[i][0]
+    print(id_s)
+    #Put RNA sequence and reactivities into files
+    if id_s in seqs:
+        f = file(syspath+"temp.txt", 'w')        
+        f.write('>'+id_s)
+        f.write('\n')
+        f.write(seqs[id_s])
+        f.close()
+        if flag == 0:
+            os.system("Fold "+syspath+"temp.txt"+" "+syspath+"output_f/"+id_s+".ct")
+        if flag == 1:
+            if id_s in react:
+                f = file(syspath+"constraint.txt",'w')
+                make_plot(react[id_s],id_s,(syspath+"output_f/")) #make a plot of the distribution of the reactivites of the input RNA
+                #h = file(syspath+"output_f/transcript_reactivities.txt", 'w')
+                #h.write(id_s)
+                #h.write('\n')
+                for j in range(0, (len(react[id_s]))):
+                    if react[id_s][j]!='NA':
+                        f.write(str(j+1))
+                        f.write('\t')
+                        f.write(str(react[id_s][j]))
+                        f.write('\n')
+                    #h.write(str(react[id_s][j])) #Output the reactivities
+                    #h.write('\t')
+                f.close()
+                #h.write('\n')
+                #h.write('\n')
+                os.system("Fold "+syspath+"temp.txt"+" -sh"+" "+syspath+"constraint.txt"+" "+syspath+"output_f/"+id_s+".ct")
+            else:
+                print(id_s+" not in the data of react!")
+        os.system("draw "+syspath+"output_f/"+id_s+".ct "+syspath+"output_f/"+id_s+".ps")
+    else:
+        print(id_s+" not in the data of sequences!")
+
+#Remove the unnecessary files
+os.system("tar -zcvPf "+output_file+" "+syspath+"output_f/"+"*.* 2>"+syspath+"log.txt")
+os.system("rm -f "+syspath+"temp.txt")
+os.system("rm -r "+syspath+"output_f")
+if flag == 1:
+    os.system("rm -f "+syspath+"constraint.txt")
+ #   h.close()
+    
+    
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/predict/predict_RNAs.xml	Mon Oct 20 14:56:56 2014 -0400
@@ -0,0 +1,59 @@
+<tool id="predict_pipeline" name="RNA structure prediction" version="1.0">
+	<description></description>
+	<command interpreter="python">predict_RNAs.py $rna_list $reference_file $reactivity_file $output </command>
+        <requirements>
+                <requirement type="package" version="1.61">biopython</requirement>
+                <requirement type="package" version="1.7">numpy</requirement>
+                <requirement type="package" version="1.2">matplotlib</requirement>
+        </requirements>
+	<inputs>
+        <param name="rna_list" type="data" format="txt" label="List of RNA ids to predict"/>
+        <param name="reference_file" type="data" format="fasta" label="Reference genome/transcriptome"/>
+        <param name="reactivity_file" type="data" optional = "true" label="Reactivity"/>
+	
+	</inputs>
+	<outputs>
+		<data name="output" format=".tgz"/>
+	</outputs>
+    <tests>
+        <test>
+            <param name="rna_list" value="id_list_test.txt" />
+	        <param name="reference_file" value="cdna.txt" />
+            <param name="reactivity_file" value="mRNA_react_test2.txt" />
+	        <output name="output" file="structures.out" />
+        </test>
+    </tests>
+	<help>
+
+
+**TIPS**:
+
+-----
+
+**Input**:
+
+* 1. A file with transcript Ids (Max num. 20), (each ID one line)
+* 2. Reference file (fasta) used to map the reads
+* [Optional]:
+* 1. A reactivity file with structural reactivity for each nucleotide on the sequence provided
+
+-----
+
+**Output**:
+
+* 1. .ct files with predicted RNA structures [transciptID.ct]
+* 2. .ps files which depict the predicted RNA structures [[transciptID.ps]
+* [Optional]
+* 3. .png files that shows the distribution of the reactivity of each nucleotide on the transcripts of interest. [transciptID.png]
+* 4. A .txt file that includes the reactivities of all the nucleotides on the transcripts of interest. [transciptID.txt]
+
+-----
+
+**Attention**
+
+Make sure any of the transcript Ids does not contain "|" or space!	
+
+
+
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/predict/rRNA.txt	Mon Oct 20 14:56:56 2014 -0400
@@ -0,0 +1,8 @@
+>25s rRNA 3375nts
+GCGACCCCAGGTCAGGCGGGATTACCCGCTGAGTTTAAGCATATCAATAAGCGGAGGAAAAGAAACTAACAAGGATTCCCTTAGTAACGGCGAGCGAACCGGGAAGAGCCCAGCTTGAAAATCGGACGTCTTCGGCGTTCGAATTGTAGTCTGGAGAAGCGTCCTCAGCGACGGACCGGGCCTAAGTTCCCTGGAAAGGGGCGCCAGAGAGGGTGAGAGCCCGTCGTGCCCGGACCCTGTCGCACCACGAGGCGCTGTCTACGAGTCGGGTTGTTTGGGAATGCAGCCCCAATCGGGCGGTAAATTCCGTCCAAGGCTAAATACGGGCGAGAGACCGATAGCGAACAAGTACCGCGAGGTAAAGATGAAAAGGACTTTGAAAAGAGAGTCAAAGAGTGCTTGAAATTGTCGGGAGGGAAGCGGATGGGGGCCGGCGATGCGTCCTGGTCGGATGCGGAACGGAGCAATCCGGTCCGCCGATCGATTCGGGGCGTGGACCGACGCGGATTACGGTGGCGGCCTAAGCCCGGGCTTTTGATACGCTTGTGGAGACGTCGCTGCCGTGATCGTGGTCTGCAGCACGCGCCTAACGGCGTGCCTCGGCATCAGCGTGCTCCGGGCGTCGGCCTGTGGGCTCCCCATTCGACCCGTCTTGAAACACGGACCAAGGAGTCTGACATGTGTGCGAGTCAACGGGTGAGTAAACCCGTAAGGCGCAAGGAAGCTGATTGGCGGGATCCTCGCGGGTGCACCGCCGACCGACCTTGATCTTCTGAGAAGGGTTCGAGTGTGAGCATGCCTGTCGGGACCCGAAAGATGGTGAACTATGCCTGAGCGGGGTAAAGCCAGAGGAAACTCTGGTGGAAGCCCGCAGCGATACTGACGTGCAAATCGTTCGTCTGACTTGGGTATAGGGGCGAAAGACTAATCGAACCATCTAGTAGCTGGTTCCCTCCGAAGTTTCCCTCAGGATAGCTGGAGCTCGGACGCGAGTTCTATCGGGTAAAGCCAATGATTAGAGGCATTGGGGGCGCAACGCCTCGACCTATTCTCAAACTTTAAATAGGTAGGACGTGTCGGCTGCTTTGTTGAGCCGTCACACGGAATCGAGAGCTCCAAGTGGGCCATTTTTGGTAAGCAGAACTGGCGATGCGGGATGAACCGGAAGCCGGGTTACGGTGCCCAACTGCGCGCTAACCTAGAACCCACAAAGGGTGTTGGTCGATTAAGACAGCAGGACGGTGGTCATGGAAGTCGAAATCCGCTAAGGAGTGTGTAACAACTCACCTGCCGAATCAACTAGCCCCGAAAATGGATGGCGCTTAAGCGCGACCTATACCCGGCCGTCGGGGCAAGAGCCAGGCCTCGATGAGTAGGAGGGCGCGGCGGTCGCTGCAAAACCTAGGGCGCGAGGCGCGGAGCGGCCGTCGGTGCAGATCTTGGTGGTAGTAGCAAATATTCAAATGAGAACTTTGAAGGCCGAAGAGGGGAAAGGTTCCATGTGAACGGCACTTGCACATGGGTTAGTCGATCCTAAGAGTCGGGGGAAACCCGTCTGATAGCGCTTAAGCGAACTTCGAAAGGGGATCCGGTTAAAATTCCGGAACCGGGACGTGGCGGTTGACGGCAACGTTAGGGAGTCCGGAGACGTCGGCGGGGGCCTCGGGAAGAGTTATCTTTTCTGTTTAACAGCCTGCCCACCCTGGAAACGGCTCAGCCGGAGGTAGGGTCCAGCGGCTGGAAGAGCACCGCACGTCGCGTGGTGTCCGGTGCGCCCCCGGGCGCCCTTGAAAATCCGGAGGACCGAGTGCCGCTCACGCCCGGTCGTACTCATAACCGCATCAGGTCTCCAAGGTGAACAGCCTCTGGTCGATGGAACAATGTAGGCAAGGGAAGTCGGCAAAATGGATCCGTAACTTCGGGAAAAGGATTGGCTCTGAGGGCTGGGCTCGGGGGTCCCAGTTCCGAACCCGTCGGCTGTCAGCGGACTGCTCGAGCTGCTTCCGCGGCGAGAGCGGGTCGCCGGCTGCCGGCCGGGGGACGACTGGGAACGGCTCTCTCGGGAGCTTTCCCCGGGCGTCGAACAGTCAGCTCAGAACTGGTACGGACAAGGGGAATCCGACTGTTTAATTAAAACAAAGCATTGCGATGGTCCCTGCGGATGCTAACGCAATGTGATTTCTGCCCAGTGCTCTGAATGTCAAAGTGAAGAAATTCAACCAAGCGCGGGTAAACGGCGGGAGTAACTATGACTCTCTTAAGGTAGCCAAATGCCTCGTCATCTAATTAGTGACGCGCATGAATGGATTAACGAGATTCCCACTGTCCCTGTCTACTATCCAGCGAAACCACAGCCAAGGGAACGGGCTTGGCAGAATCAGCGGGGAAAGAAGACCCTGTTGAGCTTGACTCTAGTCCGACTTTGTGAAATGACTTGAGAGGTGTAGGATAAGTGGGAGCTTCGGCGCAAGTGAAATACCACTACTTTTAACGTTATTTTACTTACTCCGTGAATCGGAGGCCGGGGTACAACCCCTGTTTTTGGTCCCAAGGCTCGCTTCGGCGGGTCGATCCGGGCGGAGGACATTGTCAGGTGGGGAGTTTGGCTGGGGCGGCACATCTGTTAAAAGATAACGCAGGTGTCCTAAGATGAGCTCAACGAGAACAGAAATCTCGTGTGGAACAAAAGGGTAAAAGCTCGTTTGATTCTGATTTTCAGTACGAATACGAACCGTGAAAGCGTGGCCTATCGATCCTTTAGACTTCGGAATTTGAAGCTAGAGGTGTCAGAAAAGTTACCACAGGGATAACTGGCTTGTGGCAGCCAAGCGTTCATAGCGACGTTGCTTTTTGATCCTTCGATGTCGGCTCTTCCTATCATTGTGAAGCAGAATTCACCAAGTGTTGGATTGTTCACCCACCAATAGGGAACGTGAGCTGGGTTTAGACCGTCGTGAGACAGGTTAGTTTTACCCTACTGATGCCCGCGTCGCGATAGTAATTCAACCTAGTACGAGAGGAACCGTTGATTCGCACAATTGGTCATCGCGCTTGGTTGAAAAGCCAGTGGCGCGAAGCTACCGTGCGCTGGATTATGACTGAACGCCTCTAAGTCAGAATCCGGGCTAGAAGCGACGCATGCGCCCGCCGCCCGATTGCCGACCCTCAGTAGGAGCTTAGGCTCCAAAGGCACGTGTCGTTGGCTAAGTCCGTTCGGCGGAACGGTCGTTCGGACCGCCTTGAATTATAATTACCACCGAGCGGCGGGTAGAATCCTTTGCAGACGACTTAAATACGCGACGGGGTATTGTAAGTGGCAGAGTGGCCTTGCTGCCACGATCCACTGAGATTCAGCCCTTTGTCGCTAAGATTCGA
+>gi|20197903:2706-4513 Arabidopsis thaliana chromosome 2 BAC F23H14 genomic sequence, complete sequence
+TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTAAGCCATGCATGTGTAAGTATGAACGAATTCAGACTGTGAAACTGCGAATGGCTCATTAAATCAGTTATAGTTTGTTTGATGGTAACTACTACTCGGATAACCGTAGTAATTCTAGAGCTAATACGTGCAACAAACCCCGACTTATGGAAGGGACGCATTTATTAGATAAAAGGTCGACGCGGGCTCTGCCCGTTGCTCTGATGATTCATGATAACTCGACGGATCGCATGGCCTCTGTGCTGGCGACGCATCATTCAAATTTCTGCCCTATCAACTTTCGATGGTAGGATAGTGGCCTACCATGGTGGTAACGGGTGACGGAGAATTAGGGTTCGATTCCGGAGAGGGAGCCTGAGAAACGGCTACCACATCCAAGGAAGGCAGCAGGCGCGCAAATTACCCAATCCTGACACGGGGAGGTAGTGACAATAAATAACAATACTGGGCTCTTTCGAGTCTGGTAATTGGAATGAGTACAATCTAAATCCCTTAACGAGGATCCATTGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTCCAATAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAACCTTGGGATGGGTCGGCCGGTCCGCCTTTGGTGTGCATTGGTCGGCTTGTCCCTTCGGTCGGCGATACGCTCCTGGTCTTAATTGGCCGGGTCGTGCCTCCGGCGCTGTTACTTTGAAGAAATTAGAGTGCTCAAAGCAAGCCTACGCTCTGGATACATTAGCATGGGATAACATCATAGGATTTCGATCCTATTGTGTTGGCCTTCGGGATCGGAGTAATGATTAACAGGGACAGTCGGGGGCATTCGTATTTCATAGTCAGAGGTGAAATTCTTGGATTTATGAAAGACGAACAACTGCGAAAGCATTTGCCAAGGATGTTTTCATTAATCAAGAACGAAAGTTGGGGGCTCGAAGACGATCAGATACCGTCCTAGTCTCAACCATAAACGATGCCGACCAGGGATCAGCGGATGTTGCTTATAGGACTCCGCTGGCACCTTATGAGAAATCAAAGTTTTTGGGTTCCGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAGGAATTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGGAAACTTACCAGGTCCAGACATAGTAAGGATTGACAGACTGAGAGCTCTTTCTTGATTCTATGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGCGATTTGTCTGGTTAATTCCGTTAATGAACGAGACCTCAGCCTGCTAACTAGCTACGTGGAGGCATCCCTTCACGGCCGGCTTCTTAGAGGGACTATGGCCGTTTAGGCCAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGATGTATTCAACGAGTTCACACCTTGGCCGACAGGCCCGGGTAATCTTTGAAATTTCATCGTGATGGGGATAGATCATTGCAATTGTTGGTCTTCAACGAGGAATTCCTAGTAAGCGCGAGTCATCAGCTCGCGTTGACTACGTCCCTGCCCTTTGTACACACCGCCCGTCGCTCCTACCGATTGAATGATCCGGTGAAGTGTTCGGATCGCGGCGACGTGGGTGGTTCGCCGCCCGCGACGTCGCGAGAAGTCCACTAAACCTTATCATTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTG
+>Arabidopsis thaliana 1
+GGATGCGATCATACCAGCACTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGTGCTTGGGCGAGAGTAGTACTAGGATGGGTGACCTCCTGGGAAGTCCTCGTGTTGCATCCCTC
+>gi|186498419|ref|NR_022453.1| Arabidopsis thaliana (AT2G01020) rRNA
+AAAACGACTCTCGGCAACGGATATCTCGGCTCTCGCATCGATGAAGAACGTAGCGAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCCAAGCCTTCTGGCCGAGGGCACGTCTGCCTGGGTGTCACAA
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/predict/read_file.py	Mon Oct 20 14:56:56 2014 -0400
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+
+
+
+def read_t_file(in_file):
+    f = open(in_file);
+    result = [];
+    for aline in f.readlines():
+        temp = [];
+        tline = aline.strip();
+        tl = tline.split('\t');
+        for i in range(0, len(tl)):
+            temp.append(tl[i].strip());
+        result.append(temp);
+    f.close();
+    return result;
+
+
Binary file predict/read_file.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/predict/rtts_plot.py	Mon Oct 20 14:56:56 2014 -0400
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+#Make a plot of reactivity distribution
+
+import sys
+import numpy as np
+import matplotlib
+from pylab import *
+import math
+
+#Convert the reactivities (Make NA to 0)
+def convert_react(a):
+    r = []
+    for i in range(len(a)):
+        if a[i]!='NA':
+            r.append(float(a[i]))
+        else:
+            r.append(float(0))
+    return r
+        
+
+#Make a plot of the distribution
+def make_plot(ar,id_s,path):
+    N = len(ar)
+    a = convert_react(ar)
+    w = 1
+    ind = np.arange(N)
+
+    fig = figure()
+    fig, ax = subplots()
+    ax.bar(ind+w, a, width = w, color = 'r',edgecolor = 'r')
+    ax.set_ylabel('DMS Reactivity')
+    ax.set_xlabel('Nucleotide Index')
+
+    
+    mag = int(math.log(N,10))-1
+    tail = 10**mag
+
+    intervel = int(math.ceil(float(N)/tail)/5)
+    print(N)
+    print(intervel)
+    tl = []
+    k = 0
+    ax.set_xticks(np.arange(0,N,intervel*tail))
+    print(np.arange(0,N,intervel*tail))
+    ax.set_xticklabels(np.arange(0,N,intervel*tail))
+
+    ax.set_title(id_s+" reactivity distribution")
+    savefig(path+id_s+'.tif')
+
+
+
+    
+    
+    
+
+
Binary file predict/rtts_plot.pyc has changed