Mercurial > repos > tyty > structurefold
changeset 7:b10c6790f01e draft
Uploaded
author | tyty |
---|---|
date | Mon, 15 Sep 2014 14:55:21 -0400 |
parents | 1a909794e94d |
children | 74ae6f6a3383 |
files | get_reads/.DS_Store get_reads/get_read.py get_reads/get_read.xml get_reads/read_file.py get_reads/test.bam get_reads/tool_dependencies.xml reactivity_cal/parse_dis_react.py reactivity_cal/react_cal.py reactivity_cal/react_norm_function.py reactivity_cal/reactivity_calculation.xml reactivity_cal/read_file.py reactivity_cal/separate_rna.py reactivity_cal/tool_dependencies.xml |
diffstat | 13 files changed, 154 insertions(+), 436 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/get_read.py Mon Sep 15 14:55:21 2014 -0400 @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +#from galaxy.tools.read_file import * +from Bio import SeqIO +import os +from read_file import * + +fasta_file = sys.argv[1] +map_file = sys.argv[2] +result_file = sys.argv[3] + +os.system("samtools view -F 0xfff "+map_file+"|cut -f 3,4 > map_info.txt") + +fasta_sequences = SeqIO.parse(open(fasta_file),'fasta'); +length_seq = {}; +for seq in fasta_sequences: + nuc = seq.id; + length_seq[nuc] = len(seq.seq.tostring()); + + + +mapping = {} +transcripts = [] + +f = open("map_info.txt"); +for aline in f.readlines(): + tline = aline.strip(); + tl = tline.split('\t'); + if tl[0].strip() not in transcripts: + transcripts.append(tl[0].strip()); + mapping[tl[0].strip()] = []; + + mapping[tl[0].strip()].append(tl[1].strip()); + +distribution = {}; +coverage = {}; +for transcript in length_seq: + distribution[transcript] = []; + for i in range(0, length_seq[transcript]): + distribution[transcript].append(0); + sum_count = float(0); + if transcript in mapping: + for j in range(0, len(mapping[transcript])): + index = mapping[transcript][j]; + #count = reads[mapping[transcript][j][0]]; + sum_count = sum_count + 1; + distribution[transcript][int(index)-1] = distribution[transcript][int(index)-1] + 1; + coverage[transcript] = float(sum_count)/float(length_seq[transcript]); + else: + coverage[transcript] = 0 + + + + + +h = file(result_file, 'w') +for transcript in length_seq: + h.write(transcript); + h.write('\n') + for i in range(0, length_seq[transcript]): + h.write(str(distribution[transcript][i])) + h.write('\t') + h.write('\n') + h.write('\n') + + + + + +f.close(); +h.close() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/get_read.xml Mon Sep 15 14:55:21 2014 -0400 @@ -0,0 +1,44 @@ +<tool id="get_read_pipeline" name="Get RT stop counts" version="1.0"> + <description></description> + <command interpreter="python">get_read.py $lib_file $map_file $output </command> + <requirements> + <requirement type="package" version="1.61">biopython</requirement> + <requirement type="package" version="1.7">numpy</requirement> + <requirement type="package" version="0.1.18">samtools</requirement> + </requirements> + <inputs> + <param name="lib_file" type="data" format="fasta" label="Library file (fasta)"/> + <param name="map_file" type="data" format="bam" label="Mapped file"/> + </inputs> + <outputs> + <data name="output" format="txt"/> + </outputs> + <tests> + <test> + <param name="lib_file" value="test.bam" /> + <param name="map_file" value="com_rna.txt" /> + <output name="output" file="get_RT_stop_test.out" /> + + </test> + </tests> + + <help> + + +**TIPS**: + +----- + +**Input** +1. A mapped (bam) file from Bowtie (or any mapping program) +2. Reference library sequences (fasta) used to map the reads + +----- + +**Output**: +A text file with reverse transcription stop counts mapped to each nucleotide (RTSC file) + + + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/read_file.py Mon Sep 15 14:55:21 2014 -0400 @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + + + +def read_t_file(in_file): + f = open(in_file); + result = []; + for aline in f.readlines(): + temp = []; + tline = aline.strip(); + tl = tline.split('\t'); + for i in range(0, len(tl)): + temp.append(tl[i].strip()); + result.append(temp); + f.close(); + return result; + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/tool_dependencies.xml Mon Sep 15 14:55:21 2014 -0400 @@ -0,0 +1,12 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="biopython" version="1.61"> + <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="numpy" version="1.7"> + <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="samtools" version="0.1.18"> + <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- a/reactivity_cal/parse_dis_react.py Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys - -def parse_dist(in_file): - result = [] - distribution = {} - name = [] - f = open(in_file) - flag = 0 - for aline in f.readlines(): - line = aline.strip() - dis = line.strip() - dist = dis.split('\t') - if len(dist) > 0: - if len(dist) == 1: - if dist[0].strip().find('coverage')==-1: - if flag == 0: - name.append(line) - flag = 1 - t_name = line - else: - distribution[t_name] = 'null' - name.append(line) - flag = 1 - t_name = line - else: - distri = [] - for i in range(0, len(dist)): - distri.append(dist[i].strip()) - distribution[t_name] = distri - flag = 0 - result.append(name) - result.append(distribution) - f.close() - return result - - - - - - - - - - - - - - -
--- a/reactivity_cal/react_cal.py Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys -from Bio import SeqIO -import math -from parse_dis_react import * -from react_norm_function import * -import os - - -dist_file1 = sys.argv[1] #plus library -dist_file2 = sys.argv[2] #minus library -seq_file = sys.argv[3] -nt_spec = sys.argv[4] -flag_in = sys.argv[5] -threshold = sys.argv[6] -output_file = sys.argv[7] - - -distri_p = parse_dist(dist_file1) -distri_m = parse_dist(dist_file2) -threshold = float(threshold) - -print(flag_in) - -ospath = os.path.realpath(sys.argv[0]) -ost = ospath.split('/') -syspath = "" -for i in range(len(ost)-1): - syspath = syspath+ost[i].strip() - syspath = syspath+'/' - -h = file(syspath+"react.txt",'w') -flag_in = int(flag_in) - -seqs = SeqIO.parse(open(seq_file),'fasta'); -nt_s = set() -for i in range(len(nt_spec)): - nt_s.add(nt_spec[i]) - -flag = 0 -trans = [] -distri_p = distri_p[1] -distri_m = distri_m[1] - -#thres = int(threshold) - - -transcripts = {} -for seq in seqs: - n = seq.id - trans.append(n) - transcripts[n] = seq.seq.tostring() - - -#print(distri_p) - - -for i in range(0, len(trans)): - h.write(trans[i]) - h.write('\n') - if (trans[i].find('AT1G29930')==-1) and (trans[i].find('At1g29930')==-1): - for j in range(len(distri_p[trans[i]])): - distri_p[trans[i]][j] = math.log((int(distri_p[trans[i]][j])+1),math.e) - for j in range(len(distri_m[trans[i]])): - distri_m[trans[i]][j] = math.log((int(distri_m[trans[i]][j])+1),math.e) - s_p = sum(distri_p[trans[i]]) - s_m = sum(distri_m[trans[i]]) - length = len(distri_p[trans[i]]) - if s_p!= 0 and s_m!= 0: - r = [] - for j in range(0, len(distri_p[trans[i]])): - f_p = (float(distri_p[trans[i]][j]))/float(s_p)*length - f_m = (float(distri_m[trans[i]][j]))/float(s_m)*length - raw_react = f_p-f_m - r.append(max(0, raw_react)) - else: - for j in range(len(distri_p[trans[i]])): - distri_p[trans[i]][j] = int(distri_p[trans[i]][j]) - for j in range(len(distri_m[trans[i]])): - distri_m[trans[i]][j] = int(distri_m[trans[i]][j]) - s_p = sum(distri_p[trans[i]]) - s_m = sum(distri_m[trans[i]]) - if s_p!= 0 and s_m!= 0: - r = [] - for j in range(0, len(distri_p[trans[i]])): - f_p = float(distri_p[trans[i]][j])/float(s_p) - f_m = float(distri_m[trans[i]][j])/float(s_m) - r.append((max(0,(f_p-f_m)))*100) - - if s_p!= 0 and s_m!= 0: - for k in range(1,(len(r)-1)): - if transcripts[trans[i]][k-1] in nt_s: - h.write(str(r[k])) - h.write('\t') - else: - h.write('NA') - h.write('\t') - k = k+1 - if transcripts[trans[i]][k-1] in nt_s: - h.write(str(r[k])) - h.write('\n') - else: - h.write('NA') - h.write('\n') - - -h.close() - -if flag_in: - react_norm((syspath+"react.txt"),output_file, threshold) -else: - h_o = file(output_file, 'w') - f_i = open(syspath+"react.txt") - for aline in f_i.readlines(): - h_o.write(aline.strip()) - h_o.write('\n') -os.system("rm -f "+syspath+"react.txt") - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
--- a/reactivity_cal/react_norm_function.py Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import sys -from Bio import SeqIO -import math -from parse_dis_react import * - -def cap(a,value): - if a>=value: - return value - else: - return a - -def react_norm(react_file, result_file, capped_value): - print("Normalizing.....") - react1 = parse_dist(react_file) - react = react1[1] - h = file(result_file, 'w') - - capped = int(capped_value) - - all_react = [] - - - for t in react: - if react[t]!='null': - for i in range(len(react[t])): - if react[t][i]!='NA': - all_react.append(float(react[t][i])) -# except: -# print(react[t][i]) -# print(t) -# print(i) - - all_react.sort(reverse = True) - #print((all_react)) - #print(all_react[int(len(all_react)*0.02)]) - #print(all_react[int(len(all_react)*0.03)]) - #print(all_react[int(len(all_react)*0.025)]) - #print(all_react[int(len(all_react)*0.04)]) - #print(all_react[int(len(all_react)*0.05)]) - ''' - mean = sum(all_react)/len(all_react) - print(mean) - temp = 0 - - for i in range(len(all_react)): - temp = temp+all_react[i]*all_react[i] - temp = temp/len(all_react) - sd = math.sqrt(temp-mean*mean) - ''' - eight = all_react[int(len(all_react)*0.02):int(len(all_react)*0.1)] - meight = sum(eight)/len(eight) - - for t in react: - h.write(t) - h.write('\n') - if react[t]!='null': - if (t.find('AT1G29930')==-1) and (t.find('At1g29930')==-1): - for i in range((len(react[t])-1)): - if react[t][i]!='NA': - h.write(str(cap((float(react[t][i])/meight),capped))) - else: - h.write('NA') - h.write('\t') - if react[t][i+1]!='NA': - h.write(str(cap((float(react[t][i+1])/meight),capped))) - else: - h.write('NA') - h.write('\n') - else: - for i in range((len(react[t])-1)): - if react[t][i]!='NA': - h.write(str(float(react[t][i])*2.6)) - else: - h.write('NA') - h.write('\t') - if react[t][i+1]!='NA': - h.write(str(float(react[t][i])*2.6)) - else: - h.write('NA') - h.write('\n') - - - - h.close() - - - - - - - - - - - - - - - - - - - - - - - - - - - -
--- a/reactivity_cal/reactivity_calculation.xml Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -<tool id="react_cal_pipeline" name="Reactivity calculation" version="1.0"> - <description></description> - <command interpreter="python">react_cal.py $dist_file1 $dist_file2 $seq_file $nt_spec $flag_in $threshold $output </command> - <requirements> - <requirement type="package" version="1.61">biopython</requirement> - <requirement type="package" version="1.7">numpy</requirement> - </requirements> - <inputs> - <param name="dist_file1" type="data" format="txt" label="RTSC file for (+) library"/> - <param name="dist_file2" type="data" format="txt" label="RTSC file for (-) library"/> - <param name="seq_file" type="data" format="fasta" label="Reference library"/> - <param name="nt_spec" type="select" label="Nucleotide specificity"> - <option value="AC">AC</option> - <option value="ATCG">AUCG</option> - </param> - <param name="flag_in" type="boolean" checked="true" truevalue = "1" falsevalue = "0" label="Normalization is performed if checked"/> - <param name="threshold" type="float" value = "7" optional = "true" label="Value to cap the reactivities"/> - </inputs> - <outputs> - <data name="output" format="txt"/> - </outputs> - - <help> - - -**TIPS**: - ------ - -**Input**: - -* 1. RTSC files (Output of II) for (+) and (-) library -* 2. Reference file (fasta) used to map the reads -* 3. Nucleotide Specificity (Type of nucleotide to have reactivity, e.g. AC for DMS and ACTG for SHAPE) -* [Optional]: -* 1. A threshold to cap the structural reactivities. {Default: 7} -* 2. Flag that determines whether to perform 2%-8% normalization {Default: yes} - ------ - -**Output**: - -A text file with structural reactivity for each nucleotide (Reactivity file) - - - - </help> -</tool>
--- a/reactivity_cal/read_file.py Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys - - - -def read_t_file(in_file): - f = open(in_file); - result = []; - for aline in f.readlines(): - temp = []; - tline = aline.strip(); - tl = tline.split('\t'); - for i in range(0, len(tl)): - temp.append(tl[i].strip()); - result.append(temp); - f.close(); - return result; - -
--- a/reactivity_cal/separate_rna.py Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -from parse_dis_pac import * - - -dist_file = sys.argv[1] -cdna_file = sys.argv[2] -rrna_file = sys.argv[3] - -dist = parse_dist(dist_file) -dist = dist[1] -hc = file(cdna_file, 'w') -hr = file(rrna_file, 'w') - -for t in dist: - if t.find('AT') != -1: - hc.write(t) - hc.write('\n') - for i in range(len(dist[t])-1): - hc.write(dist[t][i]) - hc.write('\t') - i = i+1 - hc.write(dist[t][i]) - hc.write('\n') - else: - hr.write(t) - hr.write('\n') - for i in range(len(dist[t])-1): - hr.write(dist[t][i]) - hr.write('\t') - i = i+1 - hr.write(dist[t][i]) - hr.write('\n') - -hc.close() -hr.close() - - - -
--- a/reactivity_cal/tool_dependencies.xml Mon Sep 15 14:54:59 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="biopython" version="1.61"> - <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="numpy" version="1.7"> - <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>