Mercurial > repos > tyty > structurefold
changeset 20:13024d65eee7 draft
Uploaded
author | tyty |
---|---|
date | Mon, 20 Oct 2014 14:31:27 -0400 |
parents | 01da084bba4e |
children | 362993bbcf2b |
files | get_reads/get_read.py get_reads/get_read.xml get_reads/reactivity_cal/.DS_Store get_reads/reactivity_cal/parse_dis_react.py get_reads/reactivity_cal/react_cal.py get_reads/reactivity_cal/react_norm_function.py get_reads/reactivity_cal/reactivity_calculation.xml get_reads/reactivity_cal/read_file.py get_reads/reactivity_cal/separate_rna.py get_reads/reactivity_cal/tool_dependencies.xml get_reads/read_file.py get_reads/test.bam get_reads/tool_dependencies.xml |
diffstat | 13 files changed, 436 insertions(+), 154 deletions(-) [+] |
line wrap: on
line diff
--- a/get_reads/get_read.py Mon Oct 20 02:12:08 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -#from galaxy.tools.read_file import * -from Bio import SeqIO -import os -from read_file import * - -fasta_file = sys.argv[1] -map_file = sys.argv[2] -result_file = sys.argv[3] - -os.system("samtools view -F 0xfff "+map_file+"|cut -f 3,4 > map_info.txt") - -fasta_sequences = SeqIO.parse(open(fasta_file),'fasta'); -length_seq = {}; -for seq in fasta_sequences: - nuc = seq.id; - length_seq[nuc] = len(seq.seq.tostring()); - - - -mapping = {} -transcripts = [] - -f = open("map_info.txt"); -for aline in f.readlines(): - tline = aline.strip(); - tl = tline.split('\t'); - if tl[0].strip() not in transcripts: - transcripts.append(tl[0].strip()); - mapping[tl[0].strip()] = []; - - mapping[tl[0].strip()].append(tl[1].strip()); - -distribution = {}; -coverage = {}; -for transcript in length_seq: - distribution[transcript] = []; - for i in range(0, length_seq[transcript]): - distribution[transcript].append(0); - sum_count = float(0); - if transcript in mapping: - for j in range(0, len(mapping[transcript])): - index = mapping[transcript][j]; - #count = reads[mapping[transcript][j][0]]; - sum_count = sum_count + 1; - distribution[transcript][int(index)-1] = distribution[transcript][int(index)-1] + 1; - coverage[transcript] = float(sum_count)/float(length_seq[transcript]); - else: - coverage[transcript] = 0 - - - - - -h = file(result_file, 'w') -for transcript in length_seq: - h.write(transcript); - h.write('\n') - for i in range(0, length_seq[transcript]): - h.write(str(distribution[transcript][i])) - h.write('\t') - h.write('\n') - h.write('\n') - - - - - -f.close(); -h.close() - - - -
--- a/get_reads/get_read.xml Mon Oct 20 02:12:08 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -<tool id="get_read_pipeline" name="Get RT stop counts" version="1.0"> - <description></description> - <command interpreter="python">get_read.py $lib_file $map_file $output </command> - <requirements> - <requirement type="package" version="1.61">biopython</requirement> - <requirement type="package" version="1.7">numpy</requirement> - <requirement type="package" version="0.1.18">samtools</requirement> - </requirements> - <inputs> - <param name="lib_file" type="data" format="fasta" label="Library file (fasta)"/> - <param name="map_file" type="data" format="bam" label="Mapped file"/> - </inputs> - <outputs> - <data name="output" format="txt"/> - </outputs> - <tests> - <test> - <param name="lib_file" value="test.bam" /> - <param name="map_file" value="com_rna.txt" /> - <output name="output" file="get_RT_stop_test.out" /> - - </test> - </tests> - - <help> - - -**TIPS**: - ------ - -**Input** -1. A mapped (bam) file from Bowtie (or any mapping program) -2. Reference library sequences (fasta) used to map the reads - ------ - -**Output**: -A text file with reverse transcription stop counts mapped to each nucleotide (RTSC file) - - - - </help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/parse_dis_react.py Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import sys + +def parse_dist(in_file): + result = [] + distribution = {} + name = [] + f = open(in_file) + flag = 0 + for aline in f.readlines(): + line = aline.strip() + dis = line.strip() + dist = dis.split('\t') + if len(dist) > 0: + if len(dist) == 1: + if dist[0].strip().find('coverage')==-1: + if flag == 0: + name.append(line) + flag = 1 + t_name = line + else: + distribution[t_name] = 'null' + name.append(line) + flag = 1 + t_name = line + else: + distri = [] + for i in range(0, len(dist)): + distri.append(dist[i].strip()) + distribution[t_name] = distri + flag = 0 + result.append(name) + result.append(distribution) + f.close() + return result + + + + + + + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/react_cal.py Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import sys +from Bio import SeqIO +import math +from parse_dis_react import * +from react_norm_function import * +import os + + +dist_file1 = sys.argv[1] #plus library +dist_file2 = sys.argv[2] #minus library +seq_file = sys.argv[3] +nt_spec = sys.argv[4] +flag_in = sys.argv[5] +threshold = sys.argv[6] +output_file = sys.argv[7] + + +distri_p = parse_dist(dist_file1) +distri_m = parse_dist(dist_file2) +threshold = float(threshold) + +print(flag_in) + +ospath = os.path.realpath(sys.argv[0]) +ost = ospath.split('/') +syspath = "" +for i in range(len(ost)-1): + syspath = syspath+ost[i].strip() + syspath = syspath+'/' + +h = file(syspath+"react.txt",'w') +flag_in = int(flag_in) + +seqs = SeqIO.parse(open(seq_file),'fasta'); +nt_s = set() +for i in range(len(nt_spec)): + nt_s.add(nt_spec[i]) + +flag = 0 +trans = [] +distri_p = distri_p[1] +distri_m = distri_m[1] + +#thres = int(threshold) + + +transcripts = {} +for seq in seqs: + n = seq.id + trans.append(n) + transcripts[n] = seq.seq.tostring() + + +#print(distri_p) + + +for i in range(0, len(trans)): + h.write(trans[i]) + h.write('\n') + if (trans[i].find('AT1G29930')==-1) and (trans[i].find('At1g29930')==-1): + for j in range(len(distri_p[trans[i]])): + distri_p[trans[i]][j] = math.log((int(distri_p[trans[i]][j])+1),math.e) + for j in range(len(distri_m[trans[i]])): + distri_m[trans[i]][j] = math.log((int(distri_m[trans[i]][j])+1),math.e) + s_p = sum(distri_p[trans[i]]) + s_m = sum(distri_m[trans[i]]) + length = len(distri_p[trans[i]]) + if s_p!= 0 and s_m!= 0: + r = [] + for j in range(0, len(distri_p[trans[i]])): + f_p = (float(distri_p[trans[i]][j]))/float(s_p)*length + f_m = (float(distri_m[trans[i]][j]))/float(s_m)*length + raw_react = f_p-f_m + r.append(max(0, raw_react)) + else: + for j in range(len(distri_p[trans[i]])): + distri_p[trans[i]][j] = int(distri_p[trans[i]][j]) + for j in range(len(distri_m[trans[i]])): + distri_m[trans[i]][j] = int(distri_m[trans[i]][j]) + s_p = sum(distri_p[trans[i]]) + s_m = sum(distri_m[trans[i]]) + if s_p!= 0 and s_m!= 0: + r = [] + for j in range(0, len(distri_p[trans[i]])): + f_p = float(distri_p[trans[i]][j])/float(s_p) + f_m = float(distri_m[trans[i]][j])/float(s_m) + r.append((max(0,(f_p-f_m)))*100) + + if s_p!= 0 and s_m!= 0: + for k in range(1,(len(r)-1)): + if transcripts[trans[i]][k-1] in nt_s: + h.write(str(r[k])) + h.write('\t') + else: + h.write('NA') + h.write('\t') + k = k+1 + if transcripts[trans[i]][k-1] in nt_s: + h.write(str(r[k])) + h.write('\n') + else: + h.write('NA') + h.write('\n') + + +h.close() + +if flag_in: + react_norm((syspath+"react.txt"),output_file, threshold) +else: + h_o = file(output_file, 'w') + f_i = open(syspath+"react.txt") + for aline in f_i.readlines(): + h_o.write(aline.strip()) + h_o.write('\n') +os.system("rm -f "+syspath+"react.txt") + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/react_norm_function.py Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import sys +from Bio import SeqIO +import math +from parse_dis_react import * + +def cap(a,value): + if a>=value: + return value + else: + return a + +def react_norm(react_file, result_file, capped_value): + print("Normalizing.....") + react1 = parse_dist(react_file) + react = react1[1] + h = file(result_file, 'w') + + capped = int(capped_value) + + all_react = [] + + + for t in react: + if react[t]!='null': + for i in range(len(react[t])): + if react[t][i]!='NA': + all_react.append(float(react[t][i])) +# except: +# print(react[t][i]) +# print(t) +# print(i) + + all_react.sort(reverse = True) + #print((all_react)) + #print(all_react[int(len(all_react)*0.02)]) + #print(all_react[int(len(all_react)*0.03)]) + #print(all_react[int(len(all_react)*0.025)]) + #print(all_react[int(len(all_react)*0.04)]) + #print(all_react[int(len(all_react)*0.05)]) + ''' + mean = sum(all_react)/len(all_react) + print(mean) + temp = 0 + + for i in range(len(all_react)): + temp = temp+all_react[i]*all_react[i] + temp = temp/len(all_react) + sd = math.sqrt(temp-mean*mean) + ''' + eight = all_react[int(len(all_react)*0.02):int(len(all_react)*0.1)] + meight = sum(eight)/len(eight) + + for t in react: + h.write(t) + h.write('\n') + if react[t]!='null': + if (t.find('AT1G29930')==-1) and (t.find('At1g29930')==-1): + for i in range((len(react[t])-1)): + if react[t][i]!='NA': + h.write(str(cap((float(react[t][i])/meight),capped))) + else: + h.write('NA') + h.write('\t') + if react[t][i+1]!='NA': + h.write(str(cap((float(react[t][i+1])/meight),capped))) + else: + h.write('NA') + h.write('\n') + else: + for i in range((len(react[t])-1)): + if react[t][i]!='NA': + h.write(str(float(react[t][i])*2.6)) + else: + h.write('NA') + h.write('\t') + if react[t][i+1]!='NA': + h.write(str(float(react[t][i])*2.6)) + else: + h.write('NA') + h.write('\n') + + + + h.close() + + + + + + + + + + + + + + + + + + + + + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/reactivity_calculation.xml Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,48 @@ +<tool id="react_cal_pipeline" name="Reactivity calculation" version="1.0"> + <description></description> + <command interpreter="python">react_cal.py $dist_file1 $dist_file2 $seq_file $nt_spec $flag_in $threshold $output </command> + <requirements> + <requirement type="package" version="1.61">biopython</requirement> + <requirement type="package" version="1.7">numpy</requirement> + </requirements> + <inputs> + <param name="dist_file1" type="data" format="txt" label="RTSC file for (+) library"/> + <param name="dist_file2" type="data" format="txt" label="RTSC file for (-) library"/> + <param name="seq_file" type="data" format="fasta" label="Reference library"/> + <param name="nt_spec" type="select" label="Nucleotide specificity"> + <option value="AC">AC</option> + <option value="ATCG">AUCG</option> + </param> + <param name="flag_in" type="boolean" checked="true" truevalue = "1" falsevalue = "0" label="Normalization is performed if checked"/> + <param name="threshold" type="float" value = "7" optional = "true" label="Value to cap the reactivities"/> + </inputs> + <outputs> + <data name="output" format="txt"/> + </outputs> + + <help> + + +**TIPS**: + +----- + +**Input**: + +* 1. RTSC files (Output of II) for (+) and (-) library +* 2. Reference file (fasta) used to map the reads +* 3. Nucleotide Specificity (Type of nucleotide to have reactivity, e.g. AC for DMS and ACTG for SHAPE) +* [Optional]: +* 1. A threshold to cap the structural reactivities. {Default: 7} +* 2. Flag that determines whether to perform 2%-8% normalization {Default: yes} + +----- + +**Output**: + +A text file with structural reactivity for each nucleotide (Reactivity file) + + + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/read_file.py Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + + + +def read_t_file(in_file): + f = open(in_file); + result = []; + for aline in f.readlines(): + temp = []; + tline = aline.strip(); + tl = tline.split('\t'); + for i in range(0, len(tl)): + temp.append(tl[i].strip()); + result.append(temp); + f.close(); + return result; + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/separate_rna.py Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from parse_dis_pac import * + + +dist_file = sys.argv[1] +cdna_file = sys.argv[2] +rrna_file = sys.argv[3] + +dist = parse_dist(dist_file) +dist = dist[1] +hc = file(cdna_file, 'w') +hr = file(rrna_file, 'w') + +for t in dist: + if t.find('AT') != -1: + hc.write(t) + hc.write('\n') + for i in range(len(dist[t])-1): + hc.write(dist[t][i]) + hc.write('\t') + i = i+1 + hc.write(dist[t][i]) + hc.write('\n') + else: + hr.write(t) + hr.write('\n') + for i in range(len(dist[t])-1): + hr.write(dist[t][i]) + hr.write('\t') + i = i+1 + hr.write(dist[t][i]) + hr.write('\n') + +hc.close() +hr.close() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/reactivity_cal/tool_dependencies.xml Mon Oct 20 14:31:27 2014 -0400 @@ -0,0 +1,9 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="biopython" version="1.61"> + <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="numpy" version="1.7"> + <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- a/get_reads/read_file.py Mon Oct 20 02:12:08 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys - - - -def read_t_file(in_file): - f = open(in_file); - result = []; - for aline in f.readlines(): - temp = []; - tline = aline.strip(); - tl = tline.split('\t'); - for i in range(0, len(tl)): - temp.append(tl[i].strip()); - result.append(temp); - f.close(); - return result; - -
--- a/get_reads/tool_dependencies.xml Mon Oct 20 02:12:08 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="biopython" version="1.61"> - <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="numpy" version="1.7"> - <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="samtools" version="0.1.18"> - <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>