comparison cage_scan_clustering.py @ 0:d1d0ee366702 draft default tip

Uploaded first version
author brenninc
date Wed, 11 May 2016 04:53:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d1d0ee366702
1 """
2 .. module:: cage_scan_clustering
3 :platform: Unix
4 :synopsis: cluster cage scan reads based on arbitrary distance. It assumes that the reads are sorted by chrom, TSS and strand.
5 :version: 1.0
6
7 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
8
9 .. source: https://github.com/mmendez12/umicount
10 """
11
12 import csv
13 import argparse
14
15
16 import bed12
17
18 #TODO: rewrite tests
19 def print_read_to_bed12(reads):
20 """ Merge the reads by blocks and print a single read in the BED12 format on stdout.
21 It assumes that the reads are on the same TSS and contains
22 fingerprint information in the read's name.
23
24 Args:
25 reads: A list of reads
26
27 """
28 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads)
29
30 #bed12
31 first_read = sorted(reads, key=bed12.get_start)[0]
32 chrom = bed12.get_chrom(first_read)
33 start = bed12.get_start(first_read)
34 end = start + block_starts[-1] + block_sizes[-1]
35
36 score = len(reads)
37
38 strand = bed12.get_strand(first_read)
39
40 if strand == '+':
41 thick_start = start
42 thick_end = start + block_sizes[0]
43 else:
44 thick_start = end - block_sizes[-1]
45 thick_end = end
46
47 color = "255,0,0"
48 block_count = len(block_sizes)
49 block_sizes = ','.join(map(str, block_sizes))
50 block_starts = ','.join(map(str, block_starts))
51
52 name = map(str, [chrom, start, end, strand])
53 name = ":".join(name)
54
55 output = [chrom, start, end, name, score, strand, thick_start, thick_end,
56 color, block_count, block_sizes, block_starts]
57
58 output_str = map(str, output)
59 print '\t'.join(output_str)
60
61
62 def overlapping_reads(reads, distance):
63 """returns all the overlapping reads within a given distance"""
64
65 reads_list = []
66 cur_tss = 0
67 cur_chrom = ''
68
69 for read in reads:
70
71 if not cur_tss:
72 cur_tss = bed12.get_tss(read)
73 reads_list.append(read)
74 cur_chrom = bed12.get_chrom(read)
75 continue
76
77
78 tss = bed12.get_tss(read)
79 chrom = bed12.get_chrom(read)
80
81 #if not overlap
82 if (tss - cur_tss > distance) or (chrom != cur_chrom):
83 yield reads_list
84 reads_list = [read]
85 cur_tss = tss
86 cur_chrom = chrom
87 else:
88 reads_list.append(read)
89 cur_tss = tss
90
91 yield reads_list
92
93
94 def main():
95
96 parser = argparse.ArgumentParser()
97 parser.add_argument('bed_file', help='input file')
98 parser.add_argument('-t', '--tag_distance', default=20, type=int, help='cluster all the cage tags at distance d')
99
100 args = parser.parse_args()
101
102 with open(args.bed_file) as bedfile:
103
104 reader = csv.reader(bedfile, delimiter='\t')
105 reads = (line for line in reader)
106
107 #for each reads on the same tss
108 for read_list in overlapping_reads(reads, args.tag_distance):
109 print_read_to_bed12(read_list)
110
111
112 if __name__ == '__main__':
113 main()
114
115 #TODO: combine this script with fingerprint.py