Mercurial > repos > brenninc > umicount
diff bed12.py @ 0:d1d0ee366702 draft default tip
Uploaded first version
author | brenninc |
---|---|
date | Wed, 11 May 2016 04:53:30 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed12.py Wed May 11 04:53:30 2016 -0400 @@ -0,0 +1,170 @@ +""" +.. module:: bed12 + :platform: Unix + :synopsis: Defines a set a generic function to parse and process bed12 files. + +.. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com> + +.. source: https://github.com/mmendez12/umicount + +""" + +__author__ = 'mickael' + + +import operator +import itertools + +def get_chrom(read): + """Get chromosome from a bed12 line. + + Args: + read: A list of twelve elements where each element refers to a field in the BED format. + + Returns: + The chromosome name + + >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> get_chrom(read) + 'chrX' + """ + return read[0] + + +def get_start(read): + """Get start position from a bed12 line. + + Args: + read: A list of twelve elements where each element refers to a field in the BED format. + + Returns: + An integer representing the start position of the read. + + >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> get_start(read) + 100 + """ + return int(read[1]) + + +def get_end(read): + """Get end position from a bed12 line. + + Args: + read: A list of twelve elements where each element refers to a field in the BED format. + + Returns: + An integer representing the end position of the read. + + >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> get_end(read) + 200 + """ + return int(read[2]) + + +def get_strand(read): + """Get strand from a bed12 line. + + Args: + read: A list of twelve elements where each element refers to a field in the BED format. + + Returns: + A single char representing the strand of a read + + >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> get_strand(read) + '+' + """ + return read[5] + +def get_tss(read): + """Get Transcription Start Site (TSS) from a bed12 line. + + Args: + read: A list of twelve elements where each element refers to a field in the BED format. + + Returns: + The start position as an integer if the read is on the plus strand. + The end position as an integer if the read is on the minus strand. + + >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> get_tss(read) + 100 + >>> read = ['chrX', '100', '200', 'toto', '12', '-', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> get_tss(read) + 200 + """ + strand = get_strand(read) + + if strand == '+': + return get_start(read) + else: + return get_end(read) + + +def blocks_to_absolute_start_end(read): + """Calculate the absolute start and end of the blocks from a bed12 line. + + Args: + read: A list of twelve elements where each element refers to a field in the BED format. + + Returns: + A list of tuple where each tuple contains the absolute start and end coordinates of a block. + + >>> read = ['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75'] + >>> blocks_to_absolute_start_end(read) + [(100, 121), (175, 200)] + """ + read_start = get_start(read) + + block_starts = [read_start + int(start) for start in read[11].split(',') if start] + block_sizes = [int(size) for size in read[10].split(',') if size] + + block_starts_sizes = zip(block_starts, block_sizes) + + return [(bstart, bstart + bsize) for bstart, bsize in block_starts_sizes] + + +def merge_overlapping_blocks(reads): + """Merge blocks if they overlap. + + Args: + reads: A list of read in the BED12 format. + + Returns: + Two lists where the first list contains the blocks sizes and the second the blocks starts. + Values in the lists are integer. + + >>> reads = [] + >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75']) + >>> reads.append(['chrX', '100', '200', 'toto', '12', '+', '100', '110', '255,0,0', '3', '10,10,25', '0,15,75']) + >>> merge_overlapping_blocks(reads) + ([25, 25], [0, 75]) + """ + + blocks_list = [blocks_to_absolute_start_end(read) for read in reads] + + #flatten + blocks = itertools.chain.from_iterable(blocks_list) + + final_blocks = [] + + blocks = sorted(blocks, key = operator.itemgetter(0, 1)) + known_block_start, known_block_end = blocks[0] + + for block_start, block_end in blocks[1:]: + if block_start <= known_block_end: + known_block_end = max(block_end, known_block_end) + else: + final_blocks.append((known_block_start, known_block_end)) + known_block_start, known_block_end = (block_start, block_end) + + final_blocks.append((known_block_start, known_block_end)) + + absolute_block_start = final_blocks[0][0] + + bsizes = [end - start for start, end in final_blocks] + bstarts = [start - absolute_block_start for start, end in final_blocks] + + return bsizes, bstarts