annotate get_reads/get_read.py @ 105:abe41dc167f1 draft

Uploaded
author tyty
date Thu, 19 Mar 2015 17:43:24 -0400
parents 58df1060fea1
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
62
58df1060fea1 Uploaded
tyty
parents:
diff changeset
1 #!/usr/bin/env python
58df1060fea1 Uploaded
tyty
parents:
diff changeset
2 # -*- coding: utf-8 -*-
58df1060fea1 Uploaded
tyty
parents:
diff changeset
3
58df1060fea1 Uploaded
tyty
parents:
diff changeset
4 import sys
58df1060fea1 Uploaded
tyty
parents:
diff changeset
5 from Bio import SeqIO
58df1060fea1 Uploaded
tyty
parents:
diff changeset
6 import os
58df1060fea1 Uploaded
tyty
parents:
diff changeset
7 from read_file import *
58df1060fea1 Uploaded
tyty
parents:
diff changeset
8 import random
58df1060fea1 Uploaded
tyty
parents:
diff changeset
9 import string
58df1060fea1 Uploaded
tyty
parents:
diff changeset
10
58df1060fea1 Uploaded
tyty
parents:
diff changeset
11 fasta_file = sys.argv[1]
58df1060fea1 Uploaded
tyty
parents:
diff changeset
12 map_file = sys.argv[2]
58df1060fea1 Uploaded
tyty
parents:
diff changeset
13 result_file = sys.argv[3]
58df1060fea1 Uploaded
tyty
parents:
diff changeset
14
58df1060fea1 Uploaded
tyty
parents:
diff changeset
15 syspathrs = os.getcwd()
58df1060fea1 Uploaded
tyty
parents:
diff changeset
16
58df1060fea1 Uploaded
tyty
parents:
diff changeset
17 os.system("samtools view -F 0xfff "+map_file+"|cut -f 3,4 > "+syspathrs+"map_info.txt")
58df1060fea1 Uploaded
tyty
parents:
diff changeset
18
58df1060fea1 Uploaded
tyty
parents:
diff changeset
19 fasta_sequences = SeqIO.parse(open(fasta_file),'fasta');
58df1060fea1 Uploaded
tyty
parents:
diff changeset
20 length_seq = {};
58df1060fea1 Uploaded
tyty
parents:
diff changeset
21 for seq in fasta_sequences:
58df1060fea1 Uploaded
tyty
parents:
diff changeset
22 nuc = seq.id;
58df1060fea1 Uploaded
tyty
parents:
diff changeset
23 length_seq[nuc] = len(seq.seq.tostring());
58df1060fea1 Uploaded
tyty
parents:
diff changeset
24
58df1060fea1 Uploaded
tyty
parents:
diff changeset
25
58df1060fea1 Uploaded
tyty
parents:
diff changeset
26
58df1060fea1 Uploaded
tyty
parents:
diff changeset
27 mapping = {}
58df1060fea1 Uploaded
tyty
parents:
diff changeset
28 transcripts = []
58df1060fea1 Uploaded
tyty
parents:
diff changeset
29
58df1060fea1 Uploaded
tyty
parents:
diff changeset
30 f = open(syspathrs+"map_info.txt");
58df1060fea1 Uploaded
tyty
parents:
diff changeset
31 for aline in f.readlines():
58df1060fea1 Uploaded
tyty
parents:
diff changeset
32 tline = aline.strip();
58df1060fea1 Uploaded
tyty
parents:
diff changeset
33 tl = tline.split('\t');
58df1060fea1 Uploaded
tyty
parents:
diff changeset
34 if tl[0].strip() not in transcripts:
58df1060fea1 Uploaded
tyty
parents:
diff changeset
35 transcripts.append(tl[0].strip());
58df1060fea1 Uploaded
tyty
parents:
diff changeset
36 mapping[tl[0].strip()] = [];
58df1060fea1 Uploaded
tyty
parents:
diff changeset
37
58df1060fea1 Uploaded
tyty
parents:
diff changeset
38 mapping[tl[0].strip()].append(tl[1].strip());
58df1060fea1 Uploaded
tyty
parents:
diff changeset
39
58df1060fea1 Uploaded
tyty
parents:
diff changeset
40 distribution = {};
58df1060fea1 Uploaded
tyty
parents:
diff changeset
41 coverage = {};
58df1060fea1 Uploaded
tyty
parents:
diff changeset
42 for transcript in length_seq:
58df1060fea1 Uploaded
tyty
parents:
diff changeset
43 distribution[transcript] = [];
58df1060fea1 Uploaded
tyty
parents:
diff changeset
44 for i in range(0, length_seq[transcript]):
58df1060fea1 Uploaded
tyty
parents:
diff changeset
45 distribution[transcript].append(0);
58df1060fea1 Uploaded
tyty
parents:
diff changeset
46 sum_count = float(0);
58df1060fea1 Uploaded
tyty
parents:
diff changeset
47 if transcript in mapping:
58df1060fea1 Uploaded
tyty
parents:
diff changeset
48 for j in range(0, len(mapping[transcript])):
58df1060fea1 Uploaded
tyty
parents:
diff changeset
49 index = mapping[transcript][j];
58df1060fea1 Uploaded
tyty
parents:
diff changeset
50 #count = reads[mapping[transcript][j][0]];
58df1060fea1 Uploaded
tyty
parents:
diff changeset
51 sum_count = sum_count + 1;
58df1060fea1 Uploaded
tyty
parents:
diff changeset
52 distribution[transcript][int(index)-1] = distribution[transcript][int(index)-1] + 1;
58df1060fea1 Uploaded
tyty
parents:
diff changeset
53 coverage[transcript] = float(sum_count)/float(length_seq[transcript]);
58df1060fea1 Uploaded
tyty
parents:
diff changeset
54 else:
58df1060fea1 Uploaded
tyty
parents:
diff changeset
55 coverage[transcript] = 0
58df1060fea1 Uploaded
tyty
parents:
diff changeset
56
58df1060fea1 Uploaded
tyty
parents:
diff changeset
57
58df1060fea1 Uploaded
tyty
parents:
diff changeset
58
58df1060fea1 Uploaded
tyty
parents:
diff changeset
59
58df1060fea1 Uploaded
tyty
parents:
diff changeset
60
58df1060fea1 Uploaded
tyty
parents:
diff changeset
61 h = file(result_file, 'w')
58df1060fea1 Uploaded
tyty
parents:
diff changeset
62 for transcript in length_seq:
58df1060fea1 Uploaded
tyty
parents:
diff changeset
63 h.write(transcript);
58df1060fea1 Uploaded
tyty
parents:
diff changeset
64 h.write('\n')
58df1060fea1 Uploaded
tyty
parents:
diff changeset
65 for i in range(0, length_seq[transcript]):
58df1060fea1 Uploaded
tyty
parents:
diff changeset
66 h.write(str(distribution[transcript][i]))
58df1060fea1 Uploaded
tyty
parents:
diff changeset
67 h.write('\t')
58df1060fea1 Uploaded
tyty
parents:
diff changeset
68 h.write('\n')
58df1060fea1 Uploaded
tyty
parents:
diff changeset
69 h.write('\n')
58df1060fea1 Uploaded
tyty
parents:
diff changeset
70
58df1060fea1 Uploaded
tyty
parents:
diff changeset
71 #os.system("rm -r "+syspathrs)
58df1060fea1 Uploaded
tyty
parents:
diff changeset
72
58df1060fea1 Uploaded
tyty
parents:
diff changeset
73
58df1060fea1 Uploaded
tyty
parents:
diff changeset
74
58df1060fea1 Uploaded
tyty
parents:
diff changeset
75 f.close();
58df1060fea1 Uploaded
tyty
parents:
diff changeset
76 h.close()
58df1060fea1 Uploaded
tyty
parents:
diff changeset
77
58df1060fea1 Uploaded
tyty
parents:
diff changeset
78
58df1060fea1 Uploaded
tyty
parents:
diff changeset
79
58df1060fea1 Uploaded
tyty
parents:
diff changeset
80