Mercurial > repos > tyty > structurefold
changeset 37:28f1633ede7b draft
Uploaded
author | tyty |
---|---|
date | Mon, 20 Oct 2014 14:56:56 -0400 |
parents | acb522d3fb0d |
children | b35dc7b728e5 |
files | predict/.DS_Store predict/._.DS_Store predict/parse_dis_pac.py predict/predict_RNAs.py predict/predict_RNAs.xml predict/rRNA.txt predict/read_file.py predict/read_file.pyc predict/rtts_plot.py predict/rtts_plot.pyc |
diffstat | 10 files changed, 280 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# File: predict/parse_dis_pac.py (added in changeset 37:28f1633ede7b)
#parse reactivity file into a dictionary

import sys


def parse_dist(in_file):
    """Parse a tab-separated reactivity/read file.

    The file is read line by line after stripping surrounding whitespace:

    * a line with a single field is a transcript name, unless that field
      contains the text ``coverage``, in which case the line is ignored;
    * a line with several tab-separated fields holds the values of the most
      recently seen transcript name (a later row for the same name replaces
      the earlier one);
    * blank lines are skipped.

    Parameters:
        in_file: path of the file to parse.

    Returns:
        A two-element list ``[names, distribution]`` where ``names`` is the
        list of transcript names in file order and ``distribution`` maps each
        name that has a value row to its list of stripped value strings.

    Raises:
        ValueError: if a value row appears before any transcript name.
    """
    names = []
    distribution = {}
    current_name = None

    # The context manager closes the file even if parsing fails part-way.
    with open(in_file) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                # A blank line is neither a name nor data; treating it as a
                # name would attach the following values to the key ''.
                continue

            fields = [field.strip() for field in line.split('\t')]
            if len(fields) == 1:
                if 'coverage' not in fields[0]:
                    names.append(line)
                    current_name = line
                continue

            if current_name is None:
                raise ValueError(
                    "%s: values found before any transcript name" % in_file)
            distribution[current_name] = fields

    return [names, distribution]
# File: predict/predict_RNAs.py (added in changeset 37:28f1633ede7b)
#RNA structure prediction & Output and illustrate reactivities
#
# Usage: predict_RNAs.py <id_file> <fasta_file> <reactivity_file|None> <output_tgz>

import glob
import os
import shutil
import subprocess
import sys

from Bio import SeqIO

from parse_dis_pac import parse_dist
from read_file import read_t_file
from rtts_plot import make_plot

# Upper bound on the number of ids accepted in one run.
MAX_SEQUENCES = 10


def _run(command, **kwargs):
    """Run an external program from an argument list and return its status.

    Passing a list (no shell) keeps ids and paths containing spaces or shell
    metacharacters from being interpreted. A missing executable is reported
    on stderr instead of aborting the run, so the remaining ids are still
    processed.
    """
    try:
        return subprocess.call(command, **kwargs)
    except OSError as err:
        sys.stderr.write("Could not run %s: %s\n" % (command[0], err))
        return 127


def _write_fasta(path, seq_id, sequence):
    """Write a single-record FASTA file (no trailing newline)."""
    with open(path, 'w') as handle:
        handle.write('>' + seq_id)
        handle.write('\n')
        handle.write(sequence)


def _write_constraints(path, reactivities):
    """Write '<1-based position>TAB<reactivity>' lines, skipping 'NA' values."""
    with open(path, 'w') as handle:
        for position, value in enumerate(reactivities, 1):
            if value != 'NA':
                handle.write("%d\t%s\n" % (position, value))


def main(argv):
    """Fold every requested sequence and pack the results into an archive.

    argv[1] is the id list, argv[2] the FASTA file, argv[3] the reactivity
    file (or the literal string 'None') and argv[4] the output archive.
    """
    id_file = argv[1]
    seq_file = argv[2]
    output_file = argv[4]

    # input reactivity file if provided
    react = None
    if argv[3] != 'None':
        react = parse_dist(argv[3])[1]

    # Scratch files live next to this script.
    # NOTE(review): concurrent runs share these paths and could collide;
    # kept as-is because the archive stores these absolute paths (tar -P).
    base_dir = os.path.dirname(os.path.realpath(argv[0]))
    output_dir = os.path.join(base_dir, "output_f")
    fasta_path = os.path.join(base_dir, "temp.txt")
    constraint_path = os.path.join(base_dir, "constraint.txt")
    log_path = os.path.join(base_dir, "log.txt")

    ids = read_t_file(id_file)
    if len(ids) > MAX_SEQUENCES:  # setup a limit of the number of sequence to be predicted
        # Message goes to stderr with a non-zero status so the failure is visible.
        sys.exit("Number of sequences exceeds limitation!")

    # str() works on both old and current Biopython Seq objects.
    seqs = {}
    for record in SeqIO.parse(seq_file, 'fasta'):
        seqs[record.id] = str(record.seq)

    try:
        #predict RNA structures
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        for entry in ids:
            id_s = entry[0]
            print(id_s)
            if id_s not in seqs:
                print(id_s + " not in the data of sequences!")
                continue

            #Put RNA sequence and reactivities into files
            _write_fasta(fasta_path, id_s, seqs[id_s])
            ct_path = os.path.join(output_dir, id_s + ".ct")
            ps_path = os.path.join(output_dir, id_s + ".ps")

            if react is None:
                _run(["Fold", fasta_path, ct_path])
            elif id_s in react:
                #make a plot of the distribution of the reactivites of the input RNA
                make_plot(react[id_s], id_s, os.path.join(output_dir, ""))
                _write_constraints(constraint_path, react[id_s])
                _run(["Fold", fasta_path, "-sh", constraint_path, ct_path])
            else:
                print(id_s + " not in the data of react!")
                continue  # nothing was folded, so there is nothing to draw

            _run(["draw", ct_path, ps_path])

        # Same member selection as the shell glob "output_f/*.*"; tar's
        # stderr is captured in log.txt.
        members = sorted(glob.glob(os.path.join(output_dir, "*.*")))
        with open(log_path, 'w') as log:
            _run(["tar", "-zcvPf", output_file] + members, stderr=log)
    finally:
        #Remove the unnecessary files
        if os.path.exists(fasta_path):
            os.remove(fasta_path)
        shutil.rmtree(output_dir, ignore_errors=True)
        if react is not None and os.path.exists(constraint_path):
            os.remove(constraint_path)


if __name__ == "__main__":
    main(sys.argv)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/predict_RNAs.xml Mon Oct 20 14:56:56 2014 -0400 @@ -0,0 +1,59 @@ +<tool id="predict_pipeline" name="RNA structure prediction" version="1.0"> + <description></description> + <command interpreter="python">predict_RNAs.py $rna_list $reference_file $reactivity_file $output </command> + <requirements> + <requirement type="package" version="1.61">biopython</requirement> + <requirement type="package" version="1.7">numpy</requirement> + <requirement type="package" version="1.2">matplotlib</requirement> + </requirements> + <inputs> + <param name="rna_list" type="data" format="txt" label="List of RNA ids to predict"/> + <param name="reference_file" type="data" format="fasta" label="Reference genome/transcriptome"/> + <param name="reactivity_file" type="data" optional = "true" label="Reactivity"/> + + </inputs> + <outputs> + <data name="output" format=".tgz"/> + </outputs> + <tests> + <test> + <param name="rna_list" value="id_list_test.txt" /> + <param name="reference_file" value="cdna.txt" /> + <param name="reactivity_file" value="mRNA_react_test2.txt" /> + <output name="output" file="structures.out" /> + </test> + </tests> + <help> + + +**TIPS**: + +----- + +**Input**: + +* 1. A file with transcript IDs (max. 10, one ID per line) +* 2. Reference file (fasta) used to map the reads +* [Optional]: +* 1. A reactivity file with structural reactivity for each nucleotide on the sequence provided + +----- + +**Output**: + +* 1. .ct files with predicted RNA structures [transcriptID.ct] +* 2. .ps files which depict the predicted RNA structures [transcriptID.ps] +* [Optional] +* 3. .tif files that show the distribution of the reactivity of each nucleotide on the transcripts of interest. [transcriptID.tif] +* 4. A .txt file that includes the reactivities of all the nucleotides on the transcripts of interest. [transcriptID.txt] + +----- + +**Attention** + +Make sure that none of the transcript IDs contains "|" or a space! 
+ + + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/rRNA.txt Mon Oct 20 14:56:56 2014 -0400 @@ -0,0 +1,8 @@ +>25s rRNA 3375nts +GCGACCCCAGGTCAGGCGGGATTACCCGCTGAGTTTAAGCATATCAATAAGCGGAGGAAAAGAAACTAACAAGGATTCCCTTAGTAACGGCGAGCGAACCGGGAAGAGCCCAGCTTGAAAATCGGACGTCTTCGGCGTTCGAATTGTAGTCTGGAGAAGCGTCCTCAGCGACGGACCGGGCCTAAGTTCCCTGGAAAGGGGCGCCAGAGAGGGTGAGAGCCCGTCGTGCCCGGACCCTGTCGCACCACGAGGCGCTGTCTACGAGTCGGGTTGTTTGGGAATGCAGCCCCAATCGGGCGGTAAATTCCGTCCAAGGCTAAATACGGGCGAGAGACCGATAGCGAACAAGTACCGCGAGGTAAAGATGAAAAGGACTTTGAAAAGAGAGTCAAAGAGTGCTTGAAATTGTCGGGAGGGAAGCGGATGGGGGCCGGCGATGCGTCCTGGTCGGATGCGGAACGGAGCAATCCGGTCCGCCGATCGATTCGGGGCGTGGACCGACGCGGATTACGGTGGCGGCCTAAGCCCGGGCTTTTGATACGCTTGTGGAGACGTCGCTGCCGTGATCGTGGTCTGCAGCACGCGCCTAACGGCGTGCCTCGGCATCAGCGTGCTCCGGGCGTCGGCCTGTGGGCTCCCCATTCGACCCGTCTTGAAACACGGACCAAGGAGTCTGACATGTGTGCGAGTCAACGGGTGAGTAAACCCGTAAGGCGCAAGGAAGCTGATTGGCGGGATCCTCGCGGGTGCACCGCCGACCGACCTTGATCTTCTGAGAAGGGTTCGAGTGTGAGCATGCCTGTCGGGACCCGAAAGATGGTGAACTATGCCTGAGCGGGGTAAAGCCAGAGGAAACTCTGGTGGAAGCCCGCAGCGATACTGACGTGCAAATCGTTCGTCTGACTTGGGTATAGGGGCGAAAGACTAATCGAACCATCTAGTAGCTGGTTCCCTCCGAAGTTTCCCTCAGGATAGCTGGAGCTCGGACGCGAGTTCTATCGGGTAAAGCCAATGATTAGAGGCATTGGGGGCGCAACGCCTCGACCTATTCTCAAACTTTAAATAGGTAGGACGTGTCGGCTGCTTTGTTGAGCCGTCACACGGAATCGAGAGCTCCAAGTGGGCCATTTTTGGTAAGCAGAACTGGCGATGCGGGATGAACCGGAAGCCGGGTTACGGTGCCCAACTGCGCGCTAACCTAGAACCCACAAAGGGTGTTGGTCGATTAAGACAGCAGGACGGTGGTCATGGAAGTCGAAATCCGCTAAGGAGTGTGTAACAACTCACCTGCCGAATCAACTAGCCCCGAAAATGGATGGCGCTTAAGCGCGACCTATACCCGGCCGTCGGGGCAAGAGCCAGGCCTCGATGAGTAGGAGGGCGCGGCGGTCGCTGCAAAACCTAGGGCGCGAGGCGCGGAGCGGCCGTCGGTGCAGATCTTGGTGGTAGTAGCAAATATTCAAATGAGAACTTTGAAGGCCGAAGAGGGGAAAGGTTCCATGTGAACGGCACTTGCACATGGGTTAGTCGATCCTAAGAGTCGGGGGAAACCCGTCTGATAGCGCTTAAGCGAACTTCGAAAGGGGATCCGGTTAAAATTCCGGAACCGGGACGTGGCGGTTGACGGCAACGTTAGGGAGTCCGGAGACGTCGGCGGGGGCCTCGGGAAGAGTTATCTTTTCTGTTTAACAGCCTGCCCACCCTGGAAACGGCTCAGCCGGAGGTAGGGTCCAGCGGCTGGAAGAGCACCGCACGTCGCGTGGTGTCCGGTGCGCCCCCGGGCGCCCTTGAAAATCCGGAGGACCGAGTGCCGCTCACGCCCGGTCGTACTCATAACCGCATCAGGTCTCCAAGGTGAACAGCCT
CTGGTCGATGGAACAATGTAGGCAAGGGAAGTCGGCAAAATGGATCCGTAACTTCGGGAAAAGGATTGGCTCTGAGGGCTGGGCTCGGGGGTCCCAGTTCCGAACCCGTCGGCTGTCAGCGGACTGCTCGAGCTGCTTCCGCGGCGAGAGCGGGTCGCCGGCTGCCGGCCGGGGGACGACTGGGAACGGCTCTCTCGGGAGCTTTCCCCGGGCGTCGAACAGTCAGCTCAGAACTGGTACGGACAAGGGGAATCCGACTGTTTAATTAAAACAAAGCATTGCGATGGTCCCTGCGGATGCTAACGCAATGTGATTTCTGCCCAGTGCTCTGAATGTCAAAGTGAAGAAATTCAACCAAGCGCGGGTAAACGGCGGGAGTAACTATGACTCTCTTAAGGTAGCCAAATGCCTCGTCATCTAATTAGTGACGCGCATGAATGGATTAACGAGATTCCCACTGTCCCTGTCTACTATCCAGCGAAACCACAGCCAAGGGAACGGGCTTGGCAGAATCAGCGGGGAAAGAAGACCCTGTTGAGCTTGACTCTAGTCCGACTTTGTGAAATGACTTGAGAGGTGTAGGATAAGTGGGAGCTTCGGCGCAAGTGAAATACCACTACTTTTAACGTTATTTTACTTACTCCGTGAATCGGAGGCCGGGGTACAACCCCTGTTTTTGGTCCCAAGGCTCGCTTCGGCGGGTCGATCCGGGCGGAGGACATTGTCAGGTGGGGAGTTTGGCTGGGGCGGCACATCTGTTAAAAGATAACGCAGGTGTCCTAAGATGAGCTCAACGAGAACAGAAATCTCGTGTGGAACAAAAGGGTAAAAGCTCGTTTGATTCTGATTTTCAGTACGAATACGAACCGTGAAAGCGTGGCCTATCGATCCTTTAGACTTCGGAATTTGAAGCTAGAGGTGTCAGAAAAGTTACCACAGGGATAACTGGCTTGTGGCAGCCAAGCGTTCATAGCGACGTTGCTTTTTGATCCTTCGATGTCGGCTCTTCCTATCATTGTGAAGCAGAATTCACCAAGTGTTGGATTGTTCACCCACCAATAGGGAACGTGAGCTGGGTTTAGACCGTCGTGAGACAGGTTAGTTTTACCCTACTGATGCCCGCGTCGCGATAGTAATTCAACCTAGTACGAGAGGAACCGTTGATTCGCACAATTGGTCATCGCGCTTGGTTGAAAAGCCAGTGGCGCGAAGCTACCGTGCGCTGGATTATGACTGAACGCCTCTAAGTCAGAATCCGGGCTAGAAGCGACGCATGCGCCCGCCGCCCGATTGCCGACCCTCAGTAGGAGCTTAGGCTCCAAAGGCACGTGTCGTTGGCTAAGTCCGTTCGGCGGAACGGTCGTTCGGACCGCCTTGAATTATAATTACCACCGAGCGGCGGGTAGAATCCTTTGCAGACGACTTAAATACGCGACGGGGTATTGTAAGTGGCAGAGTGGCCTTGCTGCCACGATCCACTGAGATTCAGCCCTTTGTCGCTAAGATTCGA +>gi|20197903:2706-4513 Arabidopsis thaliana chromosome 2 BAC F23H14 genomic sequence, complete sequence 
+TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTAAGCCATGCATGTGTAAGTATGAACGAATTCAGACTGTGAAACTGCGAATGGCTCATTAAATCAGTTATAGTTTGTTTGATGGTAACTACTACTCGGATAACCGTAGTAATTCTAGAGCTAATACGTGCAACAAACCCCGACTTATGGAAGGGACGCATTTATTAGATAAAAGGTCGACGCGGGCTCTGCCCGTTGCTCTGATGATTCATGATAACTCGACGGATCGCATGGCCTCTGTGCTGGCGACGCATCATTCAAATTTCTGCCCTATCAACTTTCGATGGTAGGATAGTGGCCTACCATGGTGGTAACGGGTGACGGAGAATTAGGGTTCGATTCCGGAGAGGGAGCCTGAGAAACGGCTACCACATCCAAGGAAGGCAGCAGGCGCGCAAATTACCCAATCCTGACACGGGGAGGTAGTGACAATAAATAACAATACTGGGCTCTTTCGAGTCTGGTAATTGGAATGAGTACAATCTAAATCCCTTAACGAGGATCCATTGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTCCAATAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAACCTTGGGATGGGTCGGCCGGTCCGCCTTTGGTGTGCATTGGTCGGCTTGTCCCTTCGGTCGGCGATACGCTCCTGGTCTTAATTGGCCGGGTCGTGCCTCCGGCGCTGTTACTTTGAAGAAATTAGAGTGCTCAAAGCAAGCCTACGCTCTGGATACATTAGCATGGGATAACATCATAGGATTTCGATCCTATTGTGTTGGCCTTCGGGATCGGAGTAATGATTAACAGGGACAGTCGGGGGCATTCGTATTTCATAGTCAGAGGTGAAATTCTTGGATTTATGAAAGACGAACAACTGCGAAAGCATTTGCCAAGGATGTTTTCATTAATCAAGAACGAAAGTTGGGGGCTCGAAGACGATCAGATACCGTCCTAGTCTCAACCATAAACGATGCCGACCAGGGATCAGCGGATGTTGCTTATAGGACTCCGCTGGCACCTTATGAGAAATCAAAGTTTTTGGGTTCCGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAGGAATTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGGAAACTTACCAGGTCCAGACATAGTAAGGATTGACAGACTGAGAGCTCTTTCTTGATTCTATGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGCGATTTGTCTGGTTAATTCCGTTAATGAACGAGACCTCAGCCTGCTAACTAGCTACGTGGAGGCATCCCTTCACGGCCGGCTTCTTAGAGGGACTATGGCCGTTTAGGCCAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGATGTATTCAACGAGTTCACACCTTGGCCGACAGGCCCGGGTAATCTTTGAAATTTCATCGTGATGGGGATAGATCATTGCAATTGTTGGTCTTCAACGAGGAATTCCTAGTAAGCGCGAGTCATCAGCTCGCGTTGACTACGTCCCTGCCCTTTGTACACACCGCCCGTCGCTCCTACCGATTGAATGATCCGGTGAAGTGTTCGGATCGCGGCGACGTGGGTGGTTCGCCGCCCGCGACGTCGCGAGAAGTCCACTAAACCTTATCATTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTG +>Arabidopsis thaliana 1 +GGATGCGATCATACCAGCACTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGTGCTTGGGCGAGAGTAGTACTAGGATGGGTGACCTCCTGGGAAGTCCTCGTGTTGCATCCCTC +>gi|186498419|ref|NR_022453.1| 
Arabidopsis thaliana (AT2G01020) rRNA +AAAACGACTCTCGGCAACGGATATCTCGGCTCTCGCATCGATGAAGAACGTAGCGAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCCAAGCCTTCTGGCCGAGGGCACGTCTGCCTGGGTGTCACAA \ No newline at end of file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File: predict/read_file.py (added in changeset 37:28f1633ede7b)

import sys


def read_t_file(in_file):
    """Read a tab-separated text file into a list of rows.

    Each line is stripped of surrounding whitespace, split on tab characters,
    and every resulting field is stripped again. A blank line yields the
    single-field row ``['']``.

    Parameters:
        in_file: path of the file to read.

    Returns:
        A list with one entry per line; each entry is the list of that line's
        stripped fields.
    """
    rows = []
    # The context manager closes the file even if reading fails part-way.
    with open(in_file) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            rows.append([field.strip() for field in fields])
    return rows
#!/usr/bin/env python
# File: predict/rtts_plot.py (added in changeset 37:28f1633ede7b)
#Make a plot of reactivity distribution

import sys
import math

import numpy as np


#Convert the reactivities (Make NA to 0)
def convert_react(a):
    """Return the reactivities in *a* as floats, mapping 'NA' to 0.0.

    Parameters:
        a: sequence of reactivity values, each either the string 'NA' or
            something ``float()`` accepts.

    Returns:
        A new list of floats with the same length as *a*.
    """
    return [0.0 if value == 'NA' else float(value) for value in a]


#Make a plot of the distribution
def make_plot(ar, id_s, path):
    """Save a bar chart of per-nucleotide reactivities to ``path + id_s + '.tif'``.

    Parameters:
        ar: sequence of reactivities ('NA' entries are drawn as 0).
        id_s: identifier used in the plot title and the output file name.
        path: prefix prepended verbatim to the file name (callers pass a
            directory ending in a path separator).

    Raises:
        ValueError: if *ar* is empty (the tick spacing is undefined).
    """
    N = len(ar)
    if N == 0:
        # math.log(0) below would fail with an opaque "math domain error".
        raise ValueError("cannot plot an empty reactivity list for " + str(id_s))

    # Imported here so a backend can be chosen before pyplot is first loaded.
    # The figure is only written to disk, so the non-interactive Agg backend
    # is used unless the caller has already imported pyplot.
    import matplotlib
    if 'matplotlib.pyplot' not in sys.modules:
        matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    a = convert_react(ar)
    w = 1
    ind = np.arange(N)

    fig, ax = plt.subplots()
    try:
        # Bars are shifted by one so nucleotide positions are 1-based.
        ax.bar(ind + w, a, width=w, color='r', edgecolor='r')
        ax.set_ylabel('DMS Reactivity')
        ax.set_xlabel('Nucleotide Index')

        # Tick spacing: a multiple of the power of ten one order of
        # magnitude below N.
        mag = int(math.log(N, 10)) - 1
        tail = 10 ** mag

        intervel = int(math.ceil(float(N) / tail) / 5)
        print(N)
        print(intervel)
        ticks = np.arange(0, N, intervel * tail)
        ax.set_xticks(ticks)
        print(ticks)
        ax.set_xticklabels(ticks)

        ax.set_title(id_s + " reactivity distribution")
        fig.savefig(path + id_s + '.tif')
    finally:
        # Release the figure; otherwise every call leaves one open.
        plt.close(fig)