0
|
1 """
|
|
2 .. module:: bed12
|
|
3 :platform: Unix
|
|
4 :synopsis: Defines a set a generic function to parse and process bed12 files.
|
|
5
|
|
6 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
|
|
7
|
|
8 .. source: https://github.com/mmendez12/umicount
|
|
9
|
|
10 """
|
|
11
|
|
12 __author__ = 'mickael'
|
|
13
|
|
14
|
|
15 import operator
|
|
16 import itertools
|
|
17
|
|
18 def get_chrom(read):
|
|
19 """Get chromosome from a bed12 line.
|
|
20
|
|
21 Args:
|
|
22 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
23
|
|
24 Returns:
|
|
25 The chromosome name
|
|
26
|
|
27 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
28 >>> get_chrom(read)
|
|
29 'chrX'
|
|
30 """
|
|
31 return read[0]
|
|
32
|
|
33
|
|
34 def get_start(read):
|
|
35 """Get start position from a bed12 line.
|
|
36
|
|
37 Args:
|
|
38 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
39
|
|
40 Returns:
|
|
41 An integer representing the start position of the read.
|
|
42
|
|
43 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
44 >>> get_start(read)
|
|
45 100
|
|
46 """
|
|
47 return int(read[1])
|
|
48
|
|
49
|
|
50 def get_end(read):
|
|
51 """Get end position from a bed12 line.
|
|
52
|
|
53 Args:
|
|
54 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
55
|
|
56 Returns:
|
|
57 An integer representing the end position of the read.
|
|
58
|
|
59 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
60 >>> get_end(read)
|
|
61 200
|
|
62 """
|
|
63 return int(read[2])
|
|
64
|
|
65
|
|
66 def get_strand(read):
|
|
67 """Get strand from a bed12 line.
|
|
68
|
|
69 Args:
|
|
70 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
71
|
|
72 Returns:
|
|
73 A single char representing the strand of a read
|
|
74
|
|
75 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
76 >>> get_strand(read)
|
|
77 '+'
|
|
78 """
|
|
79 return read[5]
|
|
80
|
|
81 def get_tss(read):
|
|
82 """Get Transcription Start Site (TSS) from a bed12 line.
|
|
83
|
|
84 Args:
|
|
85 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
86
|
|
87 Returns:
|
|
88 The start position as an integer if the read is on the plus strand.
|
|
89 The end position as an integer if the read is on the minus strand.
|
|
90
|
|
91 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
92 >>> get_tss(read)
|
|
93 100
|
|
94 >>> read = ['chrX', '100', '200', 'toto', '12', '-', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
95 >>> get_tss(read)
|
|
96 200
|
|
97 """
|
|
98 strand = get_strand(read)
|
|
99
|
|
100 if strand == '+':
|
|
101 return get_start(read)
|
|
102 else:
|
|
103 return get_end(read)
|
|
104
|
|
105
|
|
106 def blocks_to_absolute_start_end(read):
|
|
107 """Calculate the absolute start and end of the blocks from a bed12 line.
|
|
108
|
|
109 Args:
|
|
110 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
111
|
|
112 Returns:
|
|
113 A list of tuple where each tuple contains the absolute start and end coordinates of a block.
|
|
114
|
|
115 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
116 >>> blocks_to_absolute_start_end(read)
|
|
117 [(100, 121), (175, 200)]
|
|
118 """
|
|
119 read_start = get_start(read)
|
|
120
|
|
121 block_starts = [read_start + int(start) for start in read[11].split(',') if start]
|
|
122 block_sizes = [int(size) for size in read[10].split(',') if size]
|
|
123
|
|
124 block_starts_sizes = zip(block_starts, block_sizes)
|
|
125
|
|
126 return [(bstart, bstart + bsize) for bstart, bsize in block_starts_sizes]
|
|
127
|
|
128
|
|
129 def merge_overlapping_blocks(reads):
|
|
130 """Merge blocks if they overlap.
|
|
131
|
|
132 Args:
|
|
133 reads: A list of read in the BED12 format.
|
|
134
|
|
135 Returns:
|
|
136 Two lists where the first list contains the blocks sizes and the second the blocks starts.
|
|
137 Values in the lists are integer.
|
|
138
|
|
139 >>> reads = []
|
|
140 >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75'])
|
|
141 >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '3', '10,10,25', '0,15,75'])
|
|
142 >>> merge_overlapping_blocks(reads)
|
|
143 ([25, 25], [0, 75])
|
|
144 """
|
|
145
|
|
146 blocks_list = [blocks_to_absolute_start_end(read) for read in reads]
|
|
147
|
|
148 #flatten
|
|
149 blocks = itertools.chain.from_iterable(blocks_list)
|
|
150
|
|
151 final_blocks = []
|
|
152
|
|
153 blocks = sorted(blocks, key = operator.itemgetter(0, 1))
|
|
154 known_block_start, known_block_end = blocks[0]
|
|
155
|
|
156 for block_start, block_end in blocks[1:]:
|
|
157 if block_start <= known_block_end:
|
|
158 known_block_end = max(block_end, known_block_end)
|
|
159 else:
|
|
160 final_blocks.append((known_block_start, known_block_end))
|
|
161 known_block_start, known_block_end = (block_start, block_end)
|
|
162
|
|
163 final_blocks.append((known_block_start, known_block_end))
|
|
164
|
|
165 absolute_block_start = final_blocks[0][0]
|
|
166
|
|
167 bsizes = [end - start for start, end in final_blocks]
|
|
168 bstarts = [start - absolute_block_start for start, end in final_blocks]
|
|
169
|
|
170 return bsizes, bstarts
|