# HG changeset patch # User tyty # Date 1429035379 14400 # Node ID a6e3505227b59ec5f8cc425af77ae777c7bcbbcb # Parent e269e4c6818ea2f3bc53fb10563b50952a20d6ee Uploaded diff -r e269e4c6818e -r a6e3505227b5 get_reads/._.DS_Store Binary file get_reads/._.DS_Store has changed diff -r e269e4c6818e -r a6e3505227b5 get_reads/._get_read.py Binary file get_reads/._get_read.py has changed diff -r e269e4c6818e -r a6e3505227b5 get_reads/._get_read.xml Binary file get_reads/._get_read.xml has changed diff -r e269e4c6818e -r a6e3505227b5 get_reads/._read_file.py Binary file get_reads/._read_file.py has changed diff -r e269e4c6818e -r a6e3505227b5 get_reads/get_read.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/get_read.py Tue Apr 14 14:16:19 2015 -0400 @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from Bio import SeqIO +import os +from read_file import * +import random +import string + +fasta_file = sys.argv[1] +map_file = sys.argv[2] +result_file = sys.argv[3] + +syspathrs = os.getcwd() + +os.system("samtools view -F 0xfff "+map_file+"|cut -f 3,4 > "+syspathrs+"map_info.txt") + +fasta_sequences = SeqIO.parse(open(fasta_file),'fasta'); +length_seq = {}; +for seq in fasta_sequences: + nuc = seq.id; + length_seq[nuc] = len(seq.seq.tostring()); + + + +mapping = {} +transcripts = [] + +f = open(syspathrs+"map_info.txt"); +for aline in f.readlines(): + tline = aline.strip(); + tl = tline.split('\t'); + if tl[0].strip() not in transcripts: + transcripts.append(tl[0].strip()); + mapping[tl[0].strip()] = []; + + mapping[tl[0].strip()].append(tl[1].strip()); + +distribution = {}; +coverage = {}; +for transcript in length_seq: + distribution[transcript] = []; + for i in range(0, length_seq[transcript]): + distribution[transcript].append(0); + sum_count = float(0); + if transcript in mapping: + for j in range(0, len(mapping[transcript])): + index = mapping[transcript][j]; + #count = reads[mapping[transcript][j][0]]; + sum_count = sum_count + 1; + distribution[transcript][int(index)-1] = distribution[transcript][int(index)-1] + 1; + coverage[transcript] = float(sum_count)/float(length_seq[transcript]); + else: + coverage[transcript] = 0 + + + + + +h = file(result_file, 'w') +for transcript in length_seq: + h.write(transcript); + h.write('\n') + for i in range(0, length_seq[transcript]): + h.write(str(distribution[transcript][i])) + h.write('\t') + h.write('\n') + h.write('\n') + +#os.system("rm -r "+syspathrs) + + + +f.close(); +h.close() + + + + diff -r e269e4c6818e -r a6e3505227b5 get_reads/get_read.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/get_read.xml Tue Apr 14 14:16:19 2015 -0400 @@ -0,0 +1,46 @@ + + derives the reverse transcriptase (RT) stop count on each nucleotide from a mapped file provided by the Iterative Mapping module + get_read.py $lib_file $map_file $output + + biopython + numpy + samtools + + + + + + + + + + + + + + + + + + +**Function** + +Get RT Stop Counts derives the RT stop counts on each nucleotide of each transcript in the reference transcriptome based on a mapped file (.bam), typically the output from the Iterative Mapping module. + +----- + +**Input**: + +* 1. A mapped (.bam) file from Bowtie (or any other mapping program) +* 2. Reference library sequences (fasta) used to map the reads to + +----- + +**Output**: + +A text file with reverse transcription stop counts mapped to each nucleotide (RTSC file) + + + + + diff -r e269e4c6818e -r a6e3505227b5 get_reads/read_file.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_reads/read_file.py Tue Apr 14 14:16:19 2015 -0400 @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + + + +def read_t_file(in_file): + f = open(in_file); + result = []; + for aline in f.readlines(): + temp = []; + tline = aline.strip(); + tl = tline.split('\t'); + for i in range(0, len(tl)): + temp.append(tl[i].strip()); + result.append(temp); + f.close(); + return result; + + diff -r e269e4c6818e -r a6e3505227b5 get_reads/read_file.pyc Binary file get_reads/read_file.pyc has changed