0
|
1 """
|
|
2 .. module:: dedup_fingerprint
|
|
3 :platform: Unix
|
|
4 :synopsis: Use UMI to count transcripts, assumes there is no barcode
|
|
5
|
|
6 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
|
|
7
|
|
8 .. source: https://github.com/mmendez12/umicount
|
|
9
|
|
10 """
|
|
11
|
|
12 import csv
|
|
13 import itertools
|
|
14 import subprocess
|
|
15 import argparse
|
|
16 import tempfile
|
|
17 import os
|
|
18 import shutil
|
|
19 from collections import defaultdict
|
|
20
|
|
21 import bed12
|
|
22
|
|
23
|
|
24 def get_fingerprint(read):
|
|
25 """Get fingerprint id from the read's name. It assumes that the read's name
|
|
26 contains the following pattern *FP:XXX;* where *XXX* is the fingerprint id.
|
|
27
|
|
28 Args:
|
|
29 read: A list of twelve elements where each element refers to a field in the BED format.
|
|
30
|
|
31 Returns:
|
|
32 A string containing the fingerprint id
|
|
33
|
|
34 >>> read = ['chrX', '100', '200', 'FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
|
|
35 >>> get_fingerprint(read)
|
|
36 '0012'
|
|
37 """
|
|
38 return read[3].split('FP:')[1].split(';')[0]
|
|
39
|
|
40
|
|
41 def print_read_to_bed12(key, reads):
|
|
42 """ Merge the reads by blocks and print a single read in the BED12 format on stdout.
|
|
43 It assumes that the reads are on the same TSS and contains
|
|
44 fingerprint information in the read's name.
|
|
45
|
|
46 Args:
|
|
47 key: A tuple that contain the chromosome, barcode and fingerprint information.
|
|
48
|
|
49 reads: A list of reads (in a list) from the same TSS, that have similar barcode and fingerprint.
|
|
50
|
|
51 >>> reads = []
|
|
52 >>> reads.append(['chrX', '100', '200', 'FP:0012', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75'])
|
|
53 >>> reads.append(['chrX', '100', '300', 'FP:0012', '12', '+', '100', '110', '255,0,0', '3', '20,25', '0,175'])
|
|
54 >>> print_read_to_bed12(('chrX', '0012'), reads) #doctest: +NORMALIZE_WHITESPACE
|
|
55 chrX 100 300 FP:0012 2 + 100 120 255,0,0 3 20,25,25 0,75,175
|
|
56 """
|
|
57 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads)
|
|
58
|
|
59 #bed12
|
|
60 first_read = sorted(reads, key = bed12.get_start)[0]
|
|
61 chrom, fingerprint = key
|
|
62 start = bed12.get_start(first_read)
|
|
63 end = start + block_starts[-1] + block_sizes[-1]
|
|
64 name = "FP:{0}".format(fingerprint)
|
|
65 score = len(reads)
|
|
66
|
|
67 strand = bed12.get_strand(first_read)
|
|
68
|
|
69 if strand == '+':
|
|
70 thick_start = start
|
|
71 thick_end = start + block_sizes[0]
|
|
72 else:
|
|
73 thick_start = end - block_sizes[-1]
|
|
74 thick_end = end
|
|
75
|
|
76 color = "255,0,0"
|
|
77 block_count = len(block_sizes)
|
|
78 block_sizes = ','.join(map(str, block_sizes))
|
|
79 block_starts = ','.join(map(str, block_starts))
|
|
80
|
|
81 output = [chrom, start, end, name, score, strand, thick_start, thick_end,
|
|
82 color, block_count, block_sizes, block_starts]
|
|
83
|
|
84 output_str = map(str, output)
|
|
85 print '\t'.join(output_str)
|
|
86
|
|
87
|
|
88 def main():
|
|
89
|
|
90 #PARSER TODO: move this code somewhere else
|
|
91 parser = argparse.ArgumentParser()
|
|
92 group = parser.add_mutually_exclusive_group(required=True)
|
|
93 group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files")
|
|
94 group.add_argument("-f", "--file", help="a bed file")
|
|
95 parser.add_argument("-o", help='name of the output file. Only works if the script is called with the -f option, \
|
|
96 ignored otherwise.')
|
|
97
|
|
98 args = parser.parse_args()
|
|
99
|
|
100 if args.directory:
|
|
101 path, folder, files = os.walk(args.directory).next()
|
|
102 elif args.file:
|
|
103 path = ''
|
|
104 files = [args.file]
|
|
105 #ENDPARSER
|
|
106
|
|
107 #create a temporary directory
|
|
108 tmp_dir = tempfile.mkdtemp()
|
|
109
|
|
110 plus_strand_tmp_file = open(os.path.join(tmp_dir, '+'), 'w')
|
|
111 minus_strand_tmp_file = open(os.path.join(tmp_dir, '-'), 'w')
|
|
112 plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s')
|
|
113
|
|
114 #creates two temporary bed files containing either reads on the plus or minus strand
|
|
115 for bed_file in files:
|
|
116
|
|
117 with open(os.path.join(path, bed_file)) as bed_file:
|
|
118 reader = csv.reader(bed_file, delimiter='\t')
|
|
119
|
|
120 for read in reader:
|
|
121 strand = bed12.get_strand(read)
|
|
122 if strand == '+':
|
|
123 plus_strand_tmp_file.write('\t'.join(read) + '\n')
|
|
124 elif strand == '-':
|
|
125 minus_strand_tmp_file.write('\t'.join(read) + '\n')
|
|
126
|
|
127
|
|
128 #close the files
|
|
129 plus_strand_tmp_file.close()
|
|
130 minus_strand_tmp_file.close()
|
|
131
|
|
132 #call unix sort on the file containing reads on the plus strand by tss
|
|
133 with open(os.path.join(tmp_dir, '+sorted'), "w") as outfile:
|
|
134 subprocess.call(["sort", '-k1,1', '-k2,2n', os.path.join(tmp_dir, '+')], stdout=outfile)
|
|
135
|
|
136 #call unix sort on the file containing reads on the minus strand by tss
|
|
137 with open(os.path.join(tmp_dir, '-sorted'), "w") as outfile:
|
|
138 subprocess.call(["sort", '-k1,1', '-k3,3n', os.path.join(tmp_dir, '-')], stdout=outfile)
|
|
139
|
|
140 #concatenate the files sorted by tss
|
|
141 with open(plus_and_minus_sorted_path, "w") as outfile:
|
|
142 subprocess.call(['cat', os.path.join(tmp_dir, '+sorted'), os.path.join(tmp_dir, '-sorted')], stdout=outfile)
|
|
143
|
|
144 with open(plus_and_minus_sorted_path) as bedfile:
|
|
145 reader = csv.reader(bedfile, delimiter='\t')
|
|
146 reads = (line for line in reader)
|
|
147
|
|
148 #for each reads on the same tss
|
|
149 for tss, same_tss_reads in itertools.groupby(reads, bed12.get_tss):
|
|
150 d = defaultdict(list)
|
|
151
|
|
152 #group the reads by chr and fingerprint
|
|
153 for read in same_tss_reads:
|
|
154 key = (bed12.get_chrom(read), get_fingerprint(read))
|
|
155 d[key].append(read)
|
|
156
|
|
157 #merge and print the reads that have same tss, and fingerprint
|
|
158 for key, same_fingerprint_reads in d.iteritems():
|
|
159 print_read_to_bed12(key, same_fingerprint_reads)
|
|
160
|
|
161 shutil.rmtree(tmp_dir)
|
|
162
|
|
163
|
|
164 if __name__ == '__main__':
|
|
165 main()
|
|
166
|
|
167 #TODO: combine this with dedup_barcode_fingerprint
|