annotate bed12.py @ 0:d1d0ee366702 draft default tip

Uploaded first version
author brenninc
date Wed, 11 May 2016 04:53:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
1 """
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
2 .. module:: bed12
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
3 :platform: Unix
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
4 :synopsis: Defines a set of generic functions to parse and process bed12 files.
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
5
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
6 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
7
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
8 .. source: https://github.com/mmendez12/umicount
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
9
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
10 """
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
11
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
12 __author__ = 'mickael'
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
13
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
14
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
15 import operator
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
16 import itertools
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
17
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def get_chrom(read):
    """Return the chromosome name of a bed12 record.

    Args:
        read: A list of twelve elements, one per BED field. The chromosome
            is the first field.

    Returns:
        The first element of ``read``, returned unchanged.

    >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> get_chrom(read)
    'chrX'
    """
    chrom = read[0]
    return chrom
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
32
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
33
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def get_start(read):
    """Return the start position of a bed12 record as an integer.

    Args:
        read: A list of twelve elements, one per BED field. The start
            position is the second field.

    Returns:
        The second element of ``read`` converted with ``int``.

    >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> get_start(read)
    100
    """
    start_field = read[1]
    return int(start_field)
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
48
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
49
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def get_end(read):
    """Return the end position of a bed12 record as an integer.

    Args:
        read: A list of twelve elements, one per BED field. The end
            position is the third field.

    Returns:
        The third element of ``read`` converted with ``int``.

    >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> get_end(read)
    200
    """
    end_field = read[2]
    return int(end_field)
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
64
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
65
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def get_strand(read):
    """Return the strand of a bed12 record.

    Args:
        read: A list of twelve elements, one per BED field. The strand is
            the sixth field.

    Returns:
        The sixth element of ``read``, returned unchanged (``'+'`` in the
        example below).

    >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> get_strand(read)
    '+'
    """
    strand = read[5]
    return strand
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
80
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def get_tss(read):
    """Return the Transcription Start Site (TSS) of a bed12 record.

    Args:
        read: A list of twelve elements, one per BED field.

    Returns:
        The start position (as an integer) when the strand field is ``'+'``.
        For any other strand value the end position (as an integer) is
        returned.

    >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> get_tss(read)
    100
    >>> read = ['chrX', '100', '200', 'toto', '12', '-', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> get_tss(read)
    200
    """
    on_plus_strand = get_strand(read) == '+'
    # Only the position that is actually needed gets parsed.
    return get_start(read) if on_plus_strand else get_end(read)
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
104
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
105
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def blocks_to_absolute_start_end(read):
    """Convert the blocks of a bed12 record to absolute coordinates.

    The block starts (twelfth field) are offsets relative to the read start,
    and the block sizes (eleventh field) are lengths. Both are comma-separated
    lists; empty items, such as the one left by a trailing comma, are skipped.

    Args:
        read: A list of twelve elements, one per BED field.

    Returns:
        A list of ``(start, end)`` tuples in absolute coordinates, one per
        block. Starts and sizes are paired positionally; if the two lists
        differ in length, the surplus entries of the longer one are ignored.

    >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
    >>> blocks_to_absolute_start_end(read)
    [(100, 121), (175, 200)]
    """
    origin = get_start(read)

    # Parse both fields completely before pairing them, starts first.
    relative_starts = list(map(int, filter(None, read[11].split(','))))
    sizes = list(map(int, filter(None, read[10].split(','))))

    absolute_blocks = []
    for relative_start, size in zip(relative_starts, sizes):
        begin = origin + relative_start
        absolute_blocks.append((begin, begin + size))
    return absolute_blocks
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
127
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
128
d1d0ee366702 Uploaded first version
brenninc
parents:
diff changeset
def merge_overlapping_blocks(reads):
    """Merge the blocks of several bed12 reads wherever they overlap.

    The blocks of every read are converted to absolute coordinates, sorted
    by (start, end) and swept from left to right. A block whose start is
    less than or equal to the end of the current merged block is folded
    into it, so blocks that merely touch are merged as well.

    Args:
        reads: An iterable of reads in the BED12 format (each a list of
            twelve fields).

    Returns:
        A tuple of two lists of integers: the sizes of the merged blocks,
        and their starts relative to the start of the first merged block.
        Both lists are empty when ``reads`` is empty or contains no block.

    >>> reads = []
    >>> merge_overlapping_blocks(reads)
    ([], [])
    >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75'])
    >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '3', '10,10,25', '0,15,75'])
    >>> merge_overlapping_blocks(reads)
    ([25, 25], [0, 75])
    """
    # Flatten the per-read block lists and order them by (start, end) so
    # that one left-to-right sweep is enough to merge them.
    blocks = sorted(
        itertools.chain.from_iterable(
            blocks_to_absolute_start_end(read) for read in reads
        ),
        key=operator.itemgetter(0, 1)
    )

    # Nothing to merge: avoid indexing into an empty list below.
    if not blocks:
        return [], []

    final_blocks = []
    known_block_start, known_block_end = blocks[0]

    for block_start, block_end in blocks[1:]:
        if block_start <= known_block_end:
            # Overlapping or touching: extend the current merged block.
            known_block_end = max(block_end, known_block_end)
        else:
            # Gap found: close the current merged block and start a new one.
            final_blocks.append((known_block_start, known_block_end))
            known_block_start, known_block_end = block_start, block_end

    # The last merged block is still open when the sweep ends.
    final_blocks.append((known_block_start, known_block_end))

    # Blocks are sorted, so the first merged block has the smallest start.
    absolute_block_start = final_blocks[0][0]

    bsizes = [end - start for start, end in final_blocks]
    bstarts = [start - absolute_block_start for start, _ in final_blocks]

    return bsizes, bstarts