Mercurial > repos > brenninc > umicount
comparison cage_scan_clustering.py @ 0:d1d0ee366702 draft default tip
Uploaded first version
author | brenninc |
---|---|
date | Wed, 11 May 2016 04:53:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d1d0ee366702 |
---|---|
1 """ | |
2 .. module:: cage_scan_clustering | |
3 :platform: Unix | |
4 :synopsis: cluster cage scan reads based on arbitrary distance. It assumes that the reads are sorted by chrom, TSS and strand. | |
5 :version: 1.0 | |
6 | |
7 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com> | |
8 | |
9 .. source: https://github.com/mmendez12/umicount | |
10 """ | |
11 | |
12 import csv | |
13 import argparse | |
14 | |
15 | |
16 import bed12 | |
17 | |
18 #TODO: rewrite tests | |
19 def print_read_to_bed12(reads): | |
20 """ Merge the reads by blocks and print a single read in the BED12 format on stdout. | |
21 It assumes that the reads are on the same TSS and contains | |
22 fingerprint information in the read's name. | |
23 | |
24 Args: | |
25 reads: A list of reads | |
26 | |
27 """ | |
28 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads) | |
29 | |
30 #bed12 | |
31 first_read = sorted(reads, key=bed12.get_start)[0] | |
32 chrom = bed12.get_chrom(first_read) | |
33 start = bed12.get_start(first_read) | |
34 end = start + block_starts[-1] + block_sizes[-1] | |
35 | |
36 score = len(reads) | |
37 | |
38 strand = bed12.get_strand(first_read) | |
39 | |
40 if strand == '+': | |
41 thick_start = start | |
42 thick_end = start + block_sizes[0] | |
43 else: | |
44 thick_start = end - block_sizes[-1] | |
45 thick_end = end | |
46 | |
47 color = "255,0,0" | |
48 block_count = len(block_sizes) | |
49 block_sizes = ','.join(map(str, block_sizes)) | |
50 block_starts = ','.join(map(str, block_starts)) | |
51 | |
52 name = map(str, [chrom, start, end, strand]) | |
53 name = ":".join(name) | |
54 | |
55 output = [chrom, start, end, name, score, strand, thick_start, thick_end, | |
56 color, block_count, block_sizes, block_starts] | |
57 | |
58 output_str = map(str, output) | |
59 print '\t'.join(output_str) | |
60 | |
61 | |
62 def overlapping_reads(reads, distance): | |
63 """returns all the overlapping reads within a given distance""" | |
64 | |
65 reads_list = [] | |
66 cur_tss = 0 | |
67 cur_chrom = '' | |
68 | |
69 for read in reads: | |
70 | |
71 if not cur_tss: | |
72 cur_tss = bed12.get_tss(read) | |
73 reads_list.append(read) | |
74 cur_chrom = bed12.get_chrom(read) | |
75 continue | |
76 | |
77 | |
78 tss = bed12.get_tss(read) | |
79 chrom = bed12.get_chrom(read) | |
80 | |
81 #if not overlap | |
82 if (tss - cur_tss > distance) or (chrom != cur_chrom): | |
83 yield reads_list | |
84 reads_list = [read] | |
85 cur_tss = tss | |
86 cur_chrom = chrom | |
87 else: | |
88 reads_list.append(read) | |
89 cur_tss = tss | |
90 | |
91 yield reads_list | |
92 | |
93 | |
94 def main(): | |
95 | |
96 parser = argparse.ArgumentParser() | |
97 parser.add_argument('bed_file', help='input file') | |
98 parser.add_argument('-t', '--tag_distance', default=20, type=int, help='cluster all the cage tags at distance d') | |
99 | |
100 args = parser.parse_args() | |
101 | |
102 with open(args.bed_file) as bedfile: | |
103 | |
104 reader = csv.reader(bedfile, delimiter='\t') | |
105 reads = (line for line in reader) | |
106 | |
107 #for each reads on the same tss | |
108 for read_list in overlapping_reads(reads, args.tag_distance): | |
109 print_read_to_bed12(read_list) | |
110 | |
111 | |
112 if __name__ == '__main__': | |
113 main() | |
114 | |
115 #TODO: combine this script with fingerprint.py |