comparison bed12.py @ 0:d1d0ee366702 draft default tip

Uploaded first version
author brenninc
date Wed, 11 May 2016 04:53:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d1d0ee366702
1 """
2 .. module:: bed12
3 :platform: Unix
4 :synopsis: Defines a set a generic function to parse and process bed12 files.
5
6 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
7
8 .. source: https://github.com/mmendez12/umicount
9
10 """
11
12 __author__ = 'mickael'
13
14
15 import operator
16 import itertools
17
18 def get_chrom(read):
19 """Get chromosome from a bed12 line.
20
21 Args:
22 read: A list of twelve elements where each element refers to a field in the BED format.
23
24 Returns:
25 The chromosome name
26
27 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
28 >>> get_chrom(read)
29 'chrX'
30 """
31 return read[0]
32
33
34 def get_start(read):
35 """Get start position from a bed12 line.
36
37 Args:
38 read: A list of twelve elements where each element refers to a field in the BED format.
39
40 Returns:
41 An integer representing the start position of the read.
42
43 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
44 >>> get_start(read)
45 100
46 """
47 return int(read[1])
48
49
50 def get_end(read):
51 """Get end position from a bed12 line.
52
53 Args:
54 read: A list of twelve elements where each element refers to a field in the BED format.
55
56 Returns:
57 An integer representing the end position of the read.
58
59 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
60 >>> get_end(read)
61 200
62 """
63 return int(read[2])
64
65
66 def get_strand(read):
67 """Get strand from a bed12 line.
68
69 Args:
70 read: A list of twelve elements where each element refers to a field in the BED format.
71
72 Returns:
73 A single char representing the strand of a read
74
75 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
76 >>> get_strand(read)
77 '+'
78 """
79 return read[5]
80
81 def get_tss(read):
82 """Get Transcription Start Site (TSS) from a bed12 line.
83
84 Args:
85 read: A list of twelve elements where each element refers to a field in the BED format.
86
87 Returns:
88 The start position as an integer if the read is on the plus strand.
89 The end position as an integer if the read is on the minus strand.
90
91 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
92 >>> get_tss(read)
93 100
94 >>> read = ['chrX', '100', '200', 'toto', '12', '-', '100', '110', '255,0,0', '2', '21,25', '0,75']
95 >>> get_tss(read)
96 200
97 """
98 strand = get_strand(read)
99
100 if strand == '+':
101 return get_start(read)
102 else:
103 return get_end(read)
104
105
106 def blocks_to_absolute_start_end(read):
107 """Calculate the absolute start and end of the blocks from a bed12 line.
108
109 Args:
110 read: A list of twelve elements where each element refers to a field in the BED format.
111
112 Returns:
113 A list of tuple where each tuple contains the absolute start and end coordinates of a block.
114
115 >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
116 >>> blocks_to_absolute_start_end(read)
117 [(100, 121), (175, 200)]
118 """
119 read_start = get_start(read)
120
121 block_starts = [read_start + int(start) for start in read[11].split(',') if start]
122 block_sizes = [int(size) for size in read[10].split(',') if size]
123
124 block_starts_sizes = zip(block_starts, block_sizes)
125
126 return [(bstart, bstart + bsize) for bstart, bsize in block_starts_sizes]
127
128
129 def merge_overlapping_blocks(reads):
130 """Merge blocks if they overlap.
131
132 Args:
133 reads: A list of read in the BED12 format.
134
135 Returns:
136 Two lists where the first list contains the blocks sizes and the second the blocks starts.
137 Values in the lists are integer.
138
139 >>> reads = []
140 >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75'])
141 >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '3', '10,10,25', '0,15,75'])
142 >>> merge_overlapping_blocks(reads)
143 ([25, 25], [0, 75])
144 """
145
146 blocks_list = [blocks_to_absolute_start_end(read) for read in reads]
147
148 #flatten
149 blocks = itertools.chain.from_iterable(blocks_list)
150
151 final_blocks = []
152
153 blocks = sorted(blocks, key = operator.itemgetter(0, 1))
154 known_block_start, known_block_end = blocks[0]
155
156 for block_start, block_end in blocks[1:]:
157 if block_start <= known_block_end:
158 known_block_end = max(block_end, known_block_end)
159 else:
160 final_blocks.append((known_block_start, known_block_end))
161 known_block_start, known_block_end = (block_start, block_end)
162
163 final_blocks.append((known_block_start, known_block_end))
164
165 absolute_block_start = final_blocks[0][0]
166
167 bsizes = [end - start for start, end in final_blocks]
168 bstarts = [start - absolute_block_start for start, end in final_blocks]
169
170 return bsizes, bstarts