Mercurial > repos > brenninc > umicount
diff cage_scan_clustering.py @ 0:d1d0ee366702 draft default tip
Uploaded first version
author | brenninc |
---|---|
date | Wed, 11 May 2016 04:53:30 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cage_scan_clustering.py Wed May 11 04:53:30 2016 -0400 @@ -0,0 +1,115 @@ +""" +.. module:: cage_scan_clustering + :platform: Unix + :synopsis: cluster cage scan reads based on arbitrary distance. It assumes that the reads are sorted by chrom, TSS and strand. + :version: 1.0 + +.. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com> + +.. source: https://github.com/mmendez12/umicount +""" + +import csv +import argparse + + +import bed12 + +#TODO: rewrite tests +def print_read_to_bed12(reads): + """ Merge the reads by blocks and print a single read in the BED12 format on stdout. + It assumes that the reads are on the same TSS and contains + fingerprint information in the read's name. + + Args: + reads: A list of reads + + """ + block_sizes, block_starts = bed12.merge_overlapping_blocks(reads) + + #bed12 + first_read = sorted(reads, key=bed12.get_start)[0] + chrom = bed12.get_chrom(first_read) + start = bed12.get_start(first_read) + end = start + block_starts[-1] + block_sizes[-1] + + score = len(reads) + + strand = bed12.get_strand(first_read) + + if strand == '+': + thick_start = start + thick_end = start + block_sizes[0] + else: + thick_start = end - block_sizes[-1] + thick_end = end + + color = "255,0,0" + block_count = len(block_sizes) + block_sizes = ','.join(map(str, block_sizes)) + block_starts = ','.join(map(str, block_starts)) + + name = map(str, [chrom, start, end, strand]) + name = ":".join(name) + + output = [chrom, start, end, name, score, strand, thick_start, thick_end, + color, block_count, block_sizes, block_starts] + + output_str = map(str, output) + print '\t'.join(output_str) + + +def overlapping_reads(reads, distance): + """returns all the overlapping reads within a given distance""" + + reads_list = [] + cur_tss = 0 + cur_chrom = '' + + for read in reads: + + if not cur_tss: + cur_tss = bed12.get_tss(read) + reads_list.append(read) + cur_chrom = bed12.get_chrom(read) + continue + + + tss = bed12.get_tss(read) + chrom = bed12.get_chrom(read) + + #if not overlap + if (tss - cur_tss > distance) or (chrom != cur_chrom): + yield reads_list + reads_list = [read] + cur_tss = tss + cur_chrom = chrom + else: + reads_list.append(read) + cur_tss = tss + + yield reads_list + + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('bed_file', help='input file') + parser.add_argument('-t', '--tag_distance', default=20, type=int, help='cluster all the cage tags at distance d') + + args = parser.parse_args() + + with open(args.bed_file) as bedfile: + + reader = csv.reader(bedfile, delimiter='\t') + reads = (line for line in reader) + + #for each reads on the same tss + for read_list in overlapping_reads(reads, args.tag_distance): + print_read_to_bed12(read_list) + + +if __name__ == '__main__': + main() + +#TODO: combine this script with fingerprint.py