# HG changeset patch # User tyty # Date 1418112210 18000 # Node ID 63c41304b2210e1b3f63448bd87ce80f77c7d8f0 # Parent 1c325ff557d93db7e9d2b18305ade522b1020038 Uploaded diff -r 1c325ff557d9 -r 63c41304b221 predict/.DS_Store Binary file predict/.DS_Store has changed diff -r 1c325ff557d9 -r 63c41304b221 predict/._predict_RNAs.xml Binary file predict/._predict_RNAs.xml has changed diff -r 1c325ff557d9 -r 63c41304b221 predict/parse_dis_pac.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/parse_dis_pac.py Tue Dec 09 03:03:30 2014 -0500 @@ -0,0 +1,43 @@ +#parse reactivity file into a dictionary + +import sys + +def parse_dist(in_file): + result = [] + distribution = {} + name = [] + f = open(in_file) + for aline in f.readlines(): + line = aline.strip() + dis = line.strip() + dist = dis.split('\t') #split the line and the reactivites or reads are in a list + if len(dist) > 0: + if len(dist) == 1: + if dist[0].strip().find('coverage')==-1: + name.append(line) #add the name in the name list + flag = 1 + t_name = line + else: + distri = [] + for i in range(0, len(dist)): + distri.append(dist[i].strip()) + distribution[t_name] = distri #add the list of reactivities into a dictionary + result.append(name) + result.append(distribution) #Output the dictionary + f.close() + return result + + + + + + + + + + + + + + + diff -r 1c325ff557d9 -r 63c41304b221 predict/parse_dis_pac.pyc Binary file predict/parse_dis_pac.pyc has changed diff -r 1c325ff557d9 -r 63c41304b221 predict/predict_RNAs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/predict_RNAs.py Tue Dec 09 03:03:30 2014 -0500 @@ -0,0 +1,102 @@ +#RNA structure prediction & Output and illustrate reactivities + +import sys +import shlex +import subprocess +import tarfile +from parse_dis_pac import * +from read_file import * +from Bio import SeqIO +import os +from rtts_plot import * +import random +import string + + +id_file = sys.argv[1] +seq_file = sys.argv[2] +predict_type = sys.argv[3] +temperature = sys.argv[4] +output_file = sys.argv[5] + + +flag = False +if predict_type!='silico': #input reactivity file if provided + react_file = sys.argv[6] + slope = sys.argv[7] + intercept = sys.argv[8] + react = parse_dist(react_file) + react = react[1] + flag = True + +syspath = os.getcwd() + +ids = read_t_file(id_file) +sequences = SeqIO.parse(seq_file, 'fasta') + + +seqs = {} +for seq in sequences: + seqs[seq.id] = seq.seq.tostring() + +if len(ids)>100: #setup a limit of the number of sequence to be predicted + print("Number of sequences exceeds limitation!") + sys.exit(0) + + +#predict RNA structures +output_directory = os.path.join(syspath, "output_files") +if not os.path.exists(output_directory): + os.makedirs(output_directory) +for i in range(len(ids)): + flag2 = 0 + id_s = ids[i][0] + #print(id_s) + #Put RNA sequence and reactivities into files + if id_s in seqs: + fh = file(os.path.join(syspath,"temp.txt"), 'w') + fh.write('>'+id_s) + fh.write('\n') + fh.write(seqs[id_s]) + fh.close() + if not flag: + command = shlex.split('Fold %s -T %s %s' % (os.path.join(syspath, 'temp.txt'), temperature, os.path.join(output_directory, '%s.ct' % id_s))) + subprocess.call(command) + else: + if id_s in react: + fh = file(os.path.join(syspath, "constraint.txt"), 'w') + make_plot(react[id_s], id_s, output_directory) #make a plot of the distribution of the reactivites of the input RNA + for j in range(0, (len(react[id_s]))): + if react[id_s][j]!='NA': + fh.write(str(j+1)) + fh.write('\t') + fh.write(str(react[id_s][j])) + fh.write('\n') + #h.write(str(react[id_s][j])) #Output the reactivities + #h.write('\t') + fh.close() + #h.write('\n') + #h.write('\n') + command = shlex.split("Fold %s -sh %s -si %s -sm %s -T %s %s" % (os.path.join(syspath, "temp.txt"), + os.path.join(syspath, "constraint.txt"), intercept, slope, temperature, + os.path.join(output_directory, "%s.ct" % id_s))) + subprocess.call(command) + else: + print(id_s+" not in the data of react!") + flag2 = 1 + if flag2 == 0: + command = shlex.split('draw %s.ct %s.ps' % (os.path.join(output_directory, id_s), os.path.join(output_directory, id_s))) + subprocess.call(command) + else: + print(id_s+" not in the data of sequences!") + +#Remove the unnecessary files +tarball = tarfile.open(output_file, 'w:') +for filename in os.listdir(output_directory): + filepath = os.path.join(output_directory, filename) + print filepath + tarball.add(filepath, arcname=filename) +#print os.listdir(syspath) +#print os.listdir(output_directory) +# tarball.add('%s.tif' % os.path.join(syspath, id_s), arcname='%s.tif' % id_s) +tarball.close() diff -r 1c325ff557d9 -r 63c41304b221 predict/predict_RNAs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/predict_RNAs.xml Tue Dec 09 03:03:30 2014 -0500 @@ -0,0 +1,79 @@ + + + + #if $reactivity.type == "restraint" + predict_RNAs.py $rna_list $reference_file $reactivity.type $temperature $output $reactivity.reactivity_file $reactivity.slope $reactivity.intercept + #else + predict_RNAs.py $rna_list $reference_file $reactivity.type $temperature $output + #end if + + + biopython + numpy + matplotlib + + + + + + + + + + + + + + + + + + + + + + + + + + +**TIPS**: + +----- + +**Input**: + +* 1. A file with transcript Ids (Max num. 100), (each ID one line) +* 2. Reference file (fasta) used to map the reads to +* 3. Temperature for RNA structure prediction +* [Optional]: +* 1. A reactivity file with structural reactivity for each nucleotide on the sequence provided +* 2. Slope used with structural restraints (default 1.8) +* 3. Intercept used with structural restraints (default -0.6) + +----- + +**Output**: + +* 1. .ct files with predicted RNA structures [transciptID.ct] +* 2. .ps files which depict the predicted RNA structures [[transciptID.ps] +* [Optional] +* 3. .png files that shows the distribution of the reactivity of each nucleotide on the transcripts of interest. [transciptID.png] + +----- + +**Attention** + +Make sure any of the transcript Ids does not contain "|" or space! + +----- + +**Backend program**: + +* 1. This module is using RNAstructure (http://rna.urmc.rochester.edu/RNAstructure.html) as the backend program to predict RNA structures. +* 2. Default parameters are used for RNAstructure expect -T (Temperature), -sm (slope used with SHAPE restraints) and -si (intercept used with SHAPE restraints) which users can specify the value + + + + + diff -r 1c325ff557d9 -r 63c41304b221 predict/rRNA.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/rRNA.txt Tue Dec 09 03:03:30 2014 -0500 @@ -0,0 +1,8 @@ +>25s rRNA 3375nts +GCGACCCCAGGTCAGGCGGGATTACCCGCTGAGTTTAAGCATATCAATAAGCGGAGGAAAAGAAACTAACAAGGATTCCCTTAGTAACGGCGAGCGAACCGGGAAGAGCCCAGCTTGAAAATCGGACGTCTTCGGCGTTCGAATTGTAGTCTGGAGAAGCGTCCTCAGCGACGGACCGGGCCTAAGTTCCCTGGAAAGGGGCGCCAGAGAGGGTGAGAGCCCGTCGTGCCCGGACCCTGTCGCACCACGAGGCGCTGTCTACGAGTCGGGTTGTTTGGGAATGCAGCCCCAATCGGGCGGTAAATTCCGTCCAAGGCTAAATACGGGCGAGAGACCGATAGCGAACAAGTACCGCGAGGTAAAGATGAAAAGGACTTTGAAAAGAGAGTCAAAGAGTGCTTGAAATTGTCGGGAGGGAAGCGGATGGGGGCCGGCGATGCGTCCTGGTCGGATGCGGAACGGAGCAATCCGGTCCGCCGATCGATTCGGGGCGTGGACCGACGCGGATTACGGTGGCGGCCTAAGCCCGGGCTTTTGATACGCTTGTGGAGACGTCGCTGCCGTGATCGTGGTCTGCAGCACGCGCCTAACGGCGTGCCTCGGCATCAGCGTGCTCCGGGCGTCGGCCTGTGGGCTCCCCATTCGACCCGTCTTGAAACACGGACCAAGGAGTCTGACATGTGTGCGAGTCAACGGGTGAGTAAACCCGTAAGGCGCAAGGAAGCTGATTGGCGGGATCCTCGCGGGTGCACCGCCGACCGACCTTGATCTTCTGAGAAGGGTTCGAGTGTGAGCATGCCTGTCGGGACCCGAAAGATGGTGAACTATGCCTGAGCGGGGTAAAGCCAGAGGAAACTCTGGTGGAAGCCCGCAGCGATACTGACGTGCAAATCGTTCGTCTGACTTGGGTATAGGGGCGAAAGACTAATCGAACCATCTAGTAGCTGGTTCCCTCCGAAGTTTCCCTCAGGATAGCTGGAGCTCGGACGCGAGTTCTATCGGGTAAAGCCAATGATTAGAGGCATTGGGGGCGCAACGCCTCGACCTATTCTCAAACTTTAAATAGGTAGGACGTGTCGGCTGCTTTGTTGAGCCGTCACACGGAATCGAGAGCTCCAAGTGGGCCATTTTTGGTAAGCAGAACTGGCGATGCGGGATGAACCGGAAGCCGGGTTACGGTGCCCAACTGCGCGCTAACCTAGAACCCACAAAGGGTGTTGGTCGATTAAGACAGCAGGACGGTGGTCATGGAAGTCGAAATCCGCTAAGGAGTGTGTAACAACTCACCTGCCGAATCAACTAGCCCCGAAAATGGATGGCGCTTAAGCGCGACCTATACCCGGCCGTCGGGGCAAGAGCCAGGCCTCGATGAGTAGGAGGGCGCGGCGGTCGCTGCAAAACCTAGGGCGCGAGGCGCGGAGCGGCCGTCGGTGCAGATCTTGGTGGTAGTAGCAAATATTCAAATGAGAACTTTGAAGGCCGAAGAGGGGAAAGGTTCCATGTGAACGGCACTTGCACATGGGTTAGTCGATCCTAAGAGTCGGGGGAAACCCGTCTGATAGCGCTTAAGCGAACTTCGAAAGGGGATCCGGTTAAAATTCCGGAACCGGGACGTGGCGGTTGACGGCAACGTTAGGGAGTCCGGAGACGTCGGCGGGGGCCTCGGGAAGAGTTATCTTTTCTGTTTAACAGCCTGCCCACCCTGGAAACGGCTCAGCCGGAGGTAGGGTCCAGCGGCTGGAAGAGCACCGCACGTCGCGTGGTGTCCGGTGCGCCCCCGGGCGCCCTTGAAAATCCGGAGGACCGAGTGCCGCTCACGCCCGGTCGTACTCATAACCGCATCAGGTCTCCAAGGTGAACAGCCTCTGGTCGATGGAACAATGTAGGCAAGGGAAGTCGGCAAAATGGATCCGTAACTTCGGGAAAAGGATTGGCTCTGAGGGCTGGGCTCGGGGGTCCCAGTTCCGAACCCGTCGGCTGTCAGCGGACTGCTCGAGCTGCTTCCGCGGCGAGAGCGGGTCGCCGGCTGCCGGCCGGGGGACGACTGGGAACGGCTCTCTCGGGAGCTTTCCCCGGGCGTCGAACAGTCAGCTCAGAACTGGTACGGACAAGGGGAATCCGACTGTTTAATTAAAACAAAGCATTGCGATGGTCCCTGCGGATGCTAACGCAATGTGATTTCTGCCCAGTGCTCTGAATGTCAAAGTGAAGAAATTCAACCAAGCGCGGGTAAACGGCGGGAGTAACTATGACTCTCTTAAGGTAGCCAAATGCCTCGTCATCTAATTAGTGACGCGCATGAATGGATTAACGAGATTCCCACTGTCCCTGTCTACTATCCAGCGAAACCACAGCCAAGGGAACGGGCTTGGCAGAATCAGCGGGGAAAGAAGACCCTGTTGAGCTTGACTCTAGTCCGACTTTGTGAAATGACTTGAGAGGTGTAGGATAAGTGGGAGCTTCGGCGCAAGTGAAATACCACTACTTTTAACGTTATTTTACTTACTCCGTGAATCGGAGGCCGGGGTACAACCCCTGTTTTTGGTCCCAAGGCTCGCTTCGGCGGGTCGATCCGGGCGGAGGACATTGTCAGGTGGGGAGTTTGGCTGGGGCGGCACATCTGTTAAAAGATAACGCAGGTGTCCTAAGATGAGCTCAACGAGAACAGAAATCTCGTGTGGAACAAAAGGGTAAAAGCTCGTTTGATTCTGATTTTCAGTACGAATACGAACCGTGAAAGCGTGGCCTATCGATCCTTTAGACTTCGGAATTTGAAGCTAGAGGTGTCAGAAAAGTTACCACAGGGATAACTGGCTTGTGGCAGCCAAGCGTTCATAGCGACGTTGCTTTTTGATCCTTCGATGTCGGCTCTTCCTATCATTGTGAAGCAGAATTCACCAAGTGTTGGATTGTTCACCCACCAATAGGGAACGTGAGCTGGGTTTAGACCGTCGTGAGACAGGTTAGTTTTACCCTACTGATGCCCGCGTCGCGATAGTAATTCAACCTAGTACGAGAGGAACCGTTGATTCGCACAATTGGTCATCGCGCTTGGTTGAAAAGCCAGTGGCGCGAAGCTACCGTGCGCTGGATTATGACTGAACGCCTCTAAGTCAGAATCCGGGCTAGAAGCGACGCATGCGCCCGCCGCCCGATTGCCGACCCTCAGTAGGAGCTTAGGCTCCAAAGGCACGTGTCGTTGGCTAAGTCCGTTCGGCGGAACGGTCGTTCGGACCGCCTTGAATTATAATTACCACCGAGCGGCGGGTAGAATCCTTTGCAGACGACTTAAATACGCGACGGGGTATTGTAAGTGGCAGAGTGGCCTTGCTGCCACGATCCACTGAGATTCAGCCCTTTGTCGCTAAGATTCGA +>gi|20197903:2706-4513 Arabidopsis thaliana chromosome 2 BAC F23H14 genomic sequence, complete sequence +TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTAAGCCATGCATGTGTAAGTATGAACGAATTCAGACTGTGAAACTGCGAATGGCTCATTAAATCAGTTATAGTTTGTTTGATGGTAACTACTACTCGGATAACCGTAGTAATTCTAGAGCTAATACGTGCAACAAACCCCGACTTATGGAAGGGACGCATTTATTAGATAAAAGGTCGACGCGGGCTCTGCCCGTTGCTCTGATGATTCATGATAACTCGACGGATCGCATGGCCTCTGTGCTGGCGACGCATCATTCAAATTTCTGCCCTATCAACTTTCGATGGTAGGATAGTGGCCTACCATGGTGGTAACGGGTGACGGAGAATTAGGGTTCGATTCCGGAGAGGGAGCCTGAGAAACGGCTACCACATCCAAGGAAGGCAGCAGGCGCGCAAATTACCCAATCCTGACACGGGGAGGTAGTGACAATAAATAACAATACTGGGCTCTTTCGAGTCTGGTAATTGGAATGAGTACAATCTAAATCCCTTAACGAGGATCCATTGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTCCAATAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAACCTTGGGATGGGTCGGCCGGTCCGCCTTTGGTGTGCATTGGTCGGCTTGTCCCTTCGGTCGGCGATACGCTCCTGGTCTTAATTGGCCGGGTCGTGCCTCCGGCGCTGTTACTTTGAAGAAATTAGAGTGCTCAAAGCAAGCCTACGCTCTGGATACATTAGCATGGGATAACATCATAGGATTTCGATCCTATTGTGTTGGCCTTCGGGATCGGAGTAATGATTAACAGGGACAGTCGGGGGCATTCGTATTTCATAGTCAGAGGTGAAATTCTTGGATTTATGAAAGACGAACAACTGCGAAAGCATTTGCCAAGGATGTTTTCATTAATCAAGAACGAAAGTTGGGGGCTCGAAGACGATCAGATACCGTCCTAGTCTCAACCATAAACGATGCCGACCAGGGATCAGCGGATGTTGCTTATAGGACTCCGCTGGCACCTTATGAGAAATCAAAGTTTTTGGGTTCCGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAGGAATTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGGAAACTTACCAGGTCCAGACATAGTAAGGATTGACAGACTGAGAGCTCTTTCTTGATTCTATGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGCGATTTGTCTGGTTAATTCCGTTAATGAACGAGACCTCAGCCTGCTAACTAGCTACGTGGAGGCATCCCTTCACGGCCGGCTTCTTAGAGGGACTATGGCCGTTTAGGCCAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGATGTATTCAACGAGTTCACACCTTGGCCGACAGGCCCGGGTAATCTTTGAAATTTCATCGTGATGGGGATAGATCATTGCAATTGTTGGTCTTCAACGAGGAATTCCTAGTAAGCGCGAGTCATCAGCTCGCGTTGACTACGTCCCTGCCCTTTGTACACACCGCCCGTCGCTCCTACCGATTGAATGATCCGGTGAAGTGTTCGGATCGCGGCGACGTGGGTGGTTCGCCGCCCGCGACGTCGCGAGAAGTCCACTAAACCTTATCATTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTG +>Arabidopsis thaliana 1 +GGATGCGATCATACCAGCACTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGTGCTTGGGCGAGAGTAGTACTAGGATGGGTGACCTCCTGGGAAGTCCTCGTGTTGCATCCCTC +>gi|186498419|ref|NR_022453.1| Arabidopsis thaliana (AT2G01020) rRNA +AAAACGACTCTCGGCAACGGATATCTCGGCTCTCGCATCGATGAAGAACGTAGCGAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCCAAGCCTTCTGGCCGAGGGCACGTCTGCCTGGGTGTCACAA \ No newline at end of file diff -r 1c325ff557d9 -r 63c41304b221 predict/read_file.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/read_file.py Tue Dec 09 03:03:30 2014 -0500 @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + + + +def read_t_file(in_file): + f = open(in_file); + result = []; + for aline in f.readlines(): + temp = []; + tline = aline.strip(); + tl = tline.split('\t'); + for i in range(0, len(tl)): + temp.append(tl[i].strip()); + result.append(temp); + f.close(); + return result; + + diff -r 1c325ff557d9 -r 63c41304b221 predict/read_file.pyc Binary file predict/read_file.pyc has changed diff -r 1c325ff557d9 -r 63c41304b221 predict/rtts_plot.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict/rtts_plot.py Tue Dec 09 03:03:30 2014 -0500 @@ -0,0 +1,60 @@ +#!/usr/bin/env python +#Make a plot of reactivity distribution + +import sys +import os +import numpy as np +import matplotlib +from pylab import * +import math + +#Convert the reactivities (Make NA to 0) +def convert_react(a): + r = [] + for i in range(len(a)): + if a[i]!='NA': + r.append(float(a[i])) + else: + r.append(float(0)) + return r + + +#Make a plot of the distribution +def make_plot(ar,id_s,path): + font = {'family' : 'normal', + 'weight' : 'bold', + 'size' : 16} + matplotlib.rc('font', **font) + N = len(ar) + a = convert_react(ar) + w = 1 + ind = np.arange(N) + + fig = figure() + fig, ax = subplots() + ax.bar(ind+w, a, width = w, color = 'black',edgecolor = 'black') + ax.set_ylabel('Final Structural Reactivity (FSR)') + ax.set_xlabel('Nucleotide Number') + + + mag = int(math.log(N,10))-1 + tail = 10**mag + + intervel = int(math.ceil(float(N)/tail/5)) + print(N) + print(intervel) + tl = [] + k = 0 + upmax = int(math.ceil(float(N)/intervel/tail)*intervel*tail)+1 + ax.set_xticks(np.arange(0,upmax,intervel*tail)) + print(np.arange(0,upmax,intervel*tail)) + ax.set_xticklabels(np.arange(0,upmax,intervel*tail)) + savefig(os.path.join(path, id_s+'.tif')) + + + + + + + + diff -r 1c325ff557d9 -r 63c41304b221 predict/rtts_plot.pyc Binary file predict/rtts_plot.pyc has changed diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/._.DS_Store Binary file reactivity_cal/._.DS_Store has changed diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/parse_dis_react.py --- a/reactivity_cal/parse_dis_react.py Tue Dec 09 03:03:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys - -def parse_dist(in_file): - result = [] - distribution = {} - name = [] - f = open(in_file) - flag = 0 - for aline in f.readlines(): - line = aline.strip() - dis = line.strip() - dist = dis.split('\t') - if len(dist) > 0: - if len(dist) == 1: - if dist[0].strip().find('coverage')==-1: - if flag == 0: - name.append(line) - flag = 1 - t_name = line - else: - distribution[t_name] = 'null' - name.append(line) - flag = 1 - t_name = line - else: - distri = [] - for i in range(0, len(dist)): - distri.append(dist[i].strip()) - distribution[t_name] = distri - flag = 0 - result.append(name) - result.append(distribution) - f.close() - return result - - - - - - - - - - - - - - - diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/parse_dis_react.pyc Binary file reactivity_cal/parse_dis_react.pyc has changed diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/rRNA.txt --- a/reactivity_cal/rRNA.txt Tue Dec 09 03:03:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ ->25s rRNA 3375nts -GCGACCCCAGGTCAGGCGGGATTACCCGCTGAGTTTAAGCATATCAATAAGCGGAGGAAAAGAAACTAACAAGGATTCCCTTAGTAACGGCGAGCGAACCGGGAAGAGCCCAGCTTGAAAATCGGACGTCTTCGGCGTTCGAATTGTAGTCTGGAGAAGCGTCCTCAGCGACGGACCGGGCCTAAGTTCCCTGGAAAGGGGCGCCAGAGAGGGTGAGAGCCCGTCGTGCCCGGACCCTGTCGCACCACGAGGCGCTGTCTACGAGTCGGGTTGTTTGGGAATGCAGCCCCAATCGGGCGGTAAATTCCGTCCAAGGCTAAATACGGGCGAGAGACCGATAGCGAACAAGTACCGCGAGGTAAAGATGAAAAGGACTTTGAAAAGAGAGTCAAAGAGTGCTTGAAATTGTCGGGAGGGAAGCGGATGGGGGCCGGCGATGCGTCCTGGTCGGATGCGGAACGGAGCAATCCGGTCCGCCGATCGATTCGGGGCGTGGACCGACGCGGATTACGGTGGCGGCCTAAGCCCGGGCTTTTGATACGCTTGTGGAGACGTCGCTGCCGTGATCGTGGTCTGCAGCACGCGCCTAACGGCGTGCCTCGGCATCAGCGTGCTCCGGGCGTCGGCCTGTGGGCTCCCCATTCGACCCGTCTTGAAACACGGACCAAGGAGTCTGACATGTGTGCGAGTCAACGGGTGAGTAAACCCGTAAGGCGCAAGGAAGCTGATTGGCGGGATCCTCGCGGGTGCACCGCCGACCGACCTTGATCTTCTGAGAAGGGTTCGAGTGTGAGCATGCCTGTCGGGACCCGAAAGATGGTGAACTATGCCTGAGCGGGGTAAAGCCAGAGGAAACTCTGGTGGAAGCCCGCAGCGATACTGACGTGCAAATCGTTCGTCTGACTTGGGTATAGGGGCGAAAGACTAATCGAACCATCTAGTAGCTGGTTCCCTCCGAAGTTTCCCTCAGGATAGCTGGAGCTCGGACGCGAGTTCTATCGGGTAAAGCCAATGATTAGAGGCATTGGGGGCGCAACGCCTCGACCTATTCTCAAACTTTAAATAGGTAGGACGTGTCGGCTGCTTTGTTGAGCCGTCACACGGAATCGAGAGCTCCAAGTGGGCCATTTTTGGTAAGCAGAACTGGCGATGCGGGATGAACCGGAAGCCGGGTTACGGTGCCCAACTGCGCGCTAACCTAGAACCCACAAAGGGTGTTGGTCGATTAAGACAGCAGGACGGTGGTCATGGAAGTCGAAATCCGCTAAGGAGTGTGTAACAACTCACCTGCCGAATCAACTAGCCCCGAAAATGGATGGCGCTTAAGCGCGACCTATACCCGGCCGTCGGGGCAAGAGCCAGGCCTCGATGAGTAGGAGGGCGCGGCGGTCGCTGCAAAACCTAGGGCGCGAGGCGCGGAGCGGCCGTCGGTGCAGATCTTGGTGGTAGTAGCAAATATTCAAATGAGAACTTTGAAGGCCGAAGAGGGGAAAGGTTCCATGTGAACGGCACTTGCACATGGGTTAGTCGATCCTAAGAGTCGGGGGAAACCCGTCTGATAGCGCTTAAGCGAACTTCGAAAGGGGATCCGGTTAAAATTCCGGAACCGGGACGTGGCGGTTGACGGCAACGTTAGGGAGTCCGGAGACGTCGGCGGGGGCCTCGGGAAGAGTTATCTTTTCTGTTTAACAGCCTGCCCACCCTGGAAACGGCTCAGCCGGAGGTAGGGTCCAGCGGCTGGAAGAGCACCGCACGTCGCGTGGTGTCCGGTGCGCCCCCGGGCGCCCTTGAAAATCCGGAGGACCGAGTGCCGCTCACGCCCGGTCGTACTCATAACCGCATCAGGTCTCCAAGGTGAACAGCCTCTGGTCGATGGAACAATGTAGGCAAGGGAAGTCGGCAAAATGGATCCGTAACTTCGGGAAAAGGATTGGCTCTGAGGGCTGGGCTCGGGGGTCCCAGTTCCGAACCCGTCGGCTGTCAGCGGACTGCTCGAGCTGCTTCCGCGGCGAGAGCGGGTCGCCGGCTGCCGGCCGGGGGACGACTGGGAACGGCTCTCTCGGGAGCTTTCCCCGGGCGTCGAACAGTCAGCTCAGAACTGGTACGGACAAGGGGAATCCGACTGTTTAATTAAAACAAAGCATTGCGATGGTCCCTGCGGATGCTAACGCAATGTGATTTCTGCCCAGTGCTCTGAATGTCAAAGTGAAGAAATTCAACCAAGCGCGGGTAAACGGCGGGAGTAACTATGACTCTCTTAAGGTAGCCAAATGCCTCGTCATCTAATTAGTGACGCGCATGAATGGATTAACGAGATTCCCACTGTCCCTGTCTACTATCCAGCGAAACCACAGCCAAGGGAACGGGCTTGGCAGAATCAGCGGGGAAAGAAGACCCTGTTGAGCTTGACTCTAGTCCGACTTTGTGAAATGACTTGAGAGGTGTAGGATAAGTGGGAGCTTCGGCGCAAGTGAAATACCACTACTTTTAACGTTATTTTACTTACTCCGTGAATCGGAGGCCGGGGTACAACCCCTGTTTTTGGTCCCAAGGCTCGCTTCGGCGGGTCGATCCGGGCGGAGGACATTGTCAGGTGGGGAGTTTGGCTGGGGCGGCACATCTGTTAAAAGATAACGCAGGTGTCCTAAGATGAGCTCAACGAGAACAGAAATCTCGTGTGGAACAAAAGGGTAAAAGCTCGTTTGATTCTGATTTTCAGTACGAATACGAACCGTGAAAGCGTGGCCTATCGATCCTTTAGACTTCGGAATTTGAAGCTAGAGGTGTCAGAAAAGTTACCACAGGGATAACTGGCTTGTGGCAGCCAAGCGTTCATAGCGACGTTGCTTTTTGATCCTTCGATGTCGGCTCTTCCTATCATTGTGAAGCAGAATTCACCAAGTGTTGGATTGTTCACCCACCAATAGGGAACGTGAGCTGGGTTTAGACCGTCGTGAGACAGGTTAGTTTTACCCTACTGATGCCCGCGTCGCGATAGTAATTCAACCTAGTACGAGAGGAACCGTTGATTCGCACAATTGGTCATCGCGCTTGGTTGAAAAGCCAGTGGCGCGAAGCTACCGTGCGCTGGATTATGACTGAACGCCTCTAAGTCAGAATCCGGGCTAGAAGCGACGCATGCGCCCGCCGCCCGATTGCCGACCCTCAGTAGGAGCTTAGGCTCCAAAGGCACGTGTCGTTGGCTAAGTCCGTTCGGCGGAACGGTCGTTCGGACCGCCTTGAATTATAATTACCACCGAGCGGCGGGTAGAATCCTTTGCAGACGACTTAAATACGCGACGGGGTATTGTAAGTGGCAGAGTGGCCTTGCTGCCACGATCCACTGAGATTCAGCCCTTTGTCGCTAAGATTCGA ->gi|20197903:2706-4513 Arabidopsis thaliana chromosome 2 BAC F23H14 genomic sequence, complete sequence -TACCTGGTTGATCCTGCCAGTAGTCATATGCTTGTCTCAAAGATTAAGCCATGCATGTGTAAGTATGAACGAATTCAGACTGTGAAACTGCGAATGGCTCATTAAATCAGTTATAGTTTGTTTGATGGTAACTACTACTCGGATAACCGTAGTAATTCTAGAGCTAATACGTGCAACAAACCCCGACTTATGGAAGGGACGCATTTATTAGATAAAAGGTCGACGCGGGCTCTGCCCGTTGCTCTGATGATTCATGATAACTCGACGGATCGCATGGCCTCTGTGCTGGCGACGCATCATTCAAATTTCTGCCCTATCAACTTTCGATGGTAGGATAGTGGCCTACCATGGTGGTAACGGGTGACGGAGAATTAGGGTTCGATTCCGGAGAGGGAGCCTGAGAAACGGCTACCACATCCAAGGAAGGCAGCAGGCGCGCAAATTACCCAATCCTGACACGGGGAGGTAGTGACAATAAATAACAATACTGGGCTCTTTCGAGTCTGGTAATTGGAATGAGTACAATCTAAATCCCTTAACGAGGATCCATTGGAGGGCAAGTCTGGTGCCAGCAGCCGCGGTAATTCCAGCTCCAATAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAACCTTGGGATGGGTCGGCCGGTCCGCCTTTGGTGTGCATTGGTCGGCTTGTCCCTTCGGTCGGCGATACGCTCCTGGTCTTAATTGGCCGGGTCGTGCCTCCGGCGCTGTTACTTTGAAGAAATTAGAGTGCTCAAAGCAAGCCTACGCTCTGGATACATTAGCATGGGATAACATCATAGGATTTCGATCCTATTGTGTTGGCCTTCGGGATCGGAGTAATGATTAACAGGGACAGTCGGGGGCATTCGTATTTCATAGTCAGAGGTGAAATTCTTGGATTTATGAAAGACGAACAACTGCGAAAGCATTTGCCAAGGATGTTTTCATTAATCAAGAACGAAAGTTGGGGGCTCGAAGACGATCAGATACCGTCCTAGTCTCAACCATAAACGATGCCGACCAGGGATCAGCGGATGTTGCTTATAGGACTCCGCTGGCACCTTATGAGAAATCAAAGTTTTTGGGTTCCGGGGGGAGTATGGTCGCAAGGCTGAAACTTAAAGGAATTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGACTCAACACGGGGAAACTTACCAGGTCCAGACATAGTAAGGATTGACAGACTGAGAGCTCTTTCTTGATTCTATGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGCGATTTGTCTGGTTAATTCCGTTAATGAACGAGACCTCAGCCTGCTAACTAGCTACGTGGAGGCATCCCTTCACGGCCGGCTTCTTAGAGGGACTATGGCCGTTTAGGCCAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGATGTATTCAACGAGTTCACACCTTGGCCGACAGGCCCGGGTAATCTTTGAAATTTCATCGTGATGGGGATAGATCATTGCAATTGTTGGTCTTCAACGAGGAATTCCTAGTAAGCGCGAGTCATCAGCTCGCGTTGACTACGTCCCTGCCCTTTGTACACACCGCCCGTCGCTCCTACCGATTGAATGATCCGGTGAAGTGTTCGGATCGCGGCGACGTGGGTGGTTCGCCGCCCGCGACGTCGCGAGAAGTCCACTAAACCTTATCATTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTG ->Arabidopsis thaliana 1 -GGATGCGATCATACCAGCACTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGTGCTTGGGCGAGAGTAGTACTAGGATGGGTGACCTCCTGGGAAGTCCTCGTGTTGCATCCCTC ->gi|186498419|ref|NR_022453.1| Arabidopsis thaliana (AT2G01020) rRNA -AAAACGACTCTCGGCAACGGATATCTCGGCTCTCGCATCGATGAAGAACGTAGCGAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTCTTTGAACGCAAGTTGCGCCCCAAGCCTTCTGGCCGAGGGCACGTCTGCCTGGGTGTCACAA \ No newline at end of file diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/react_cal.py --- a/reactivity_cal/react_cal.py Tue Dec 09 03:03:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,135 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys -from Bio import SeqIO -import math -from parse_dis_react import * -from react_norm_function import * -import os -import random -import string - - -dist_file1 = sys.argv[1] #plus library -dist_file2 = sys.argv[2] #minus library -seq_file = sys.argv[3] #Reference library(genome/cDNA) -nt_spec = sys.argv[4] #only show reactivity for AC or ATCG -flag_in = sys.argv[5] # perform 2-8% normalization (1) or not (0) -threshold = sys.argv[6] #Threshold to cap the reactivities -output_file = sys.argv[7] - - -distri_p = parse_dist(dist_file1) -distri_m = parse_dist(dist_file2) -threshold = float(threshold) - - -syspathrs = os.getcwd() - -h = file(syspathrs+"react.txt",'w') -flag_in = int(flag_in) - -seqs = SeqIO.parse(open(seq_file),'fasta'); -nt_s = set() -for i in range(len(nt_spec)): - nt_s.add(nt_spec[i]) - -flag = 0 -trans = [] -distri_p = distri_p[1] -distri_m = distri_m[1] - -#thres = int(threshold) - - -transcripts = {} -for seq in seqs: - n = seq.id - trans.append(n) - transcripts[n] = seq.seq.tostring() - - -#print(distri_p) - - -for i in range(0, len(trans)): - h.write(trans[i]) - h.write('\n') - for j in range(len(distri_p[trans[i]])): - distri_p[trans[i]][j] = math.log((int(distri_p[trans[i]][j])+1),math.e) - for j in range(len(distri_m[trans[i]])): - distri_m[trans[i]][j] = math.log((int(distri_m[trans[i]][j])+1),math.e) - s_p = sum(distri_p[trans[i]]) - s_m = sum(distri_m[trans[i]]) - length = len(distri_p[trans[i]]) - if s_p!= 0 and s_m!= 0: - r = [] - for j in range(0, len(distri_p[trans[i]])): - f_p = (float(distri_p[trans[i]][j]))/float(s_p)*length - f_m = (float(distri_m[trans[i]][j]))/float(s_m)*length - raw_react = f_p-f_m - r.append(max(0, raw_react)) - - if s_p!= 0 and s_m!= 0: - for k in range(1,(len(r)-1)): - if transcripts[trans[i]][k-1] in nt_s: - h.write(str(r[k])) - h.write('\t') - else: - h.write('NA') - h.write('\t') - k = k+1 - if transcripts[trans[i]][k-1] in nt_s: - h.write(str(r[k])) - h.write('\n') - else: - h.write('NA') - h.write('\n') - - -h.close() - -if flag_in: - react_norm((syspathrs+"react.txt"),output_file, threshold) -else: - h_o = file(output_file, 'w') - f_i = open(syspathrs+"react.txt") - for aline in f_i.readlines(): - h_o.write(aline.strip()) - h_o.write('\n') -os.system("rm -f "+syspathrs+"react.txt") - -#os.system("rm -r "+syspathrs) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/react_norm_function.py --- a/reactivity_cal/react_norm_function.py Tue Dec 09 03:03:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys -from Bio import SeqIO -import math -from parse_dis_react import * - -def cap(a,value): - if a>=value: - return value - else: - return a - -def react_norm(react_file, result_file, capped_value): - print("Normalizing.....") - react1 = parse_dist(react_file) - react = react1[1] - h = file(result_file, 'w') - - capped = int(capped_value) - - all_react = [] - - - for t in react: - if react[t]!='null': - for i in range(len(react[t])): - if react[t][i]!='NA': - all_react.append(float(react[t][i])) - - - all_react.sort(reverse = True) - - - eight = all_react[int(len(all_react)*0.02):int(len(all_react)*0.1)] - meight = sum(eight)/len(eight) - - for t in react: - h.write(t) - h.write('\n') - if react[t]!='null': - for i in range((len(react[t])-1)): - if react[t][i]!='NA': - h.write(str(cap((float(react[t][i])/meight),capped))) - else: - h.write('NA') - h.write('\t') - if react[t][i+1]!='NA': - h.write(str(cap((float(react[t][i+1])/meight),capped))) - else: - h.write('NA') - h.write('\n') - - h.close() - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/react_norm_function.pyc Binary file reactivity_cal/react_norm_function.pyc has changed diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/reactivity_calculation.xml --- a/reactivity_cal/reactivity_calculation.xml Tue Dec 09 03:03:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - - react_cal.py $dist_file1 $dist_file2 $seq_file $nt_spec $flag_in $threshold $output - - biopython - numpy - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**TIPS**: - ------ - -**Input**: - -* 1. RTSC files (Output of Get RT Stop Counts) for (+) and (-) library -* 2. Reference file (fasta) used to map the reads to -* 3. Nucleotide Specificity (Type of nucleotides to have reactivity, e.g. AC for DMS and ACTG for SHAPE) -* [Optional]: -* 1. A threshold to cap the structural reactivities. {Default: 7} -* 2. Flag that determines whether to perform 2%-8% normalization {Default: Yes} - ------ - -**Output**: - -A text file with structural reactivity for each nucleotide (Reactivity file) - - - - - diff -r 1c325ff557d9 -r 63c41304b221 reactivity_cal/read_file.py --- a/reactivity_cal/read_file.py Tue Dec 09 03:03:11 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys - - - -def read_t_file(in_file): - f = open(in_file); - result = []; - for aline in f.readlines(): - temp = []; - tline = aline.strip(); - tl = tline.split('\t'); - for i in range(0, len(tl)): - temp.append(tl[i].strip()); - result.append(temp); - f.close(); - return result; - -