comparison dedup_barcode_fingerprint.py @ 0:d1d0ee366702 draft default tip

Uploaded first version
author brenninc
date Wed, 11 May 2016 04:53:30 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d1d0ee366702
1 """
2 .. module:: fingerprint
3 :platform: Unix
4 :synopsis: Use UMI to count transcripts
5 :version: 1.0
6
7 .. moduleauthor:: Mickael Mendez <mendez.mickael@gmail.com>
8
9 .. source: https://github.com/mmendez12/umicount
10
11 """
12
13 import csv
14 import itertools
15 import subprocess
16 import argparse
17 import tempfile
18 import os
19 import shutil
20 from collections import defaultdict
21
22 import bed12
23
24
25 def get_fingerprint(read):
26 """Get fingerprint id from the read's name. It assumes that the read's name
27 contains the following pattern *FP:XXX;* where *XXX* is the fingerprint id.
28
29 Args:
30 read: A list of twelve elements where each element refers to a field in the BED format.
31
32 Returns:
33 A string containing the fingerprint id
34
35 >>> read = ['chrX', '100', '200', 'BC:ATGC;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
36 >>> get_fingerprint(read)
37 '0012'
38 """
39 return read[3].split('FP:')[1].split(';')[0]
40
41
42 def get_barcode(read):
43 """Get barcode from the read's name. It assumes that the read's name
44 contains the following pattern *BC:XXX;* where *XXX* is the actual barcode.
45
46 Args:
47 read: A list of twelve elements where each element refers to a field in the BED format.
48
49 Returns:
50 A string containing the barcode
51
52 >>> read = ['chrX', '100', '200', 'BC:ATGC;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '21,25', '0,75']
53 >>> get_barcode(read)
54 'ATGC'
55 """
56 return read[3].split('BC:')[1].split(';')[0]
57
58
59 def print_read_to_bed12(key, reads):
60 """ Merge the reads by blocks and print a single read in the BED12 format on stdout.
61 It assumes that the reads are on the same TSS and contains
62 barcode and fingerprint information in the read's name.
63
64 Args:
65 key: A tuple that contain the chromosome, barcode and fingerprint information.
66
67 reads: A list of reads (in a list) from the same TSS, that have similar barcode and fingerprint.
68
69 >>> reads = []
70 >>> reads.append(['chrX', '100', '200', 'BC:AAA;FP:0012', '12', '+', '100', '110', '255,0,0', '2', '20,25', '0,75'])
71 >>> reads.append(['chrX', '100', '300', 'BC:AAA;FP:0012', '12', '+', '100', '110', '255,0,0', '3', '20,25', '0,175'])
72 >>> print_read_to_bed12(('chrX', 'AAA', '0012'), reads) #doctest: +NORMALIZE_WHITESPACE
73 chrX 100 300 BC:AAA;FP:0012 2 + 100 120 255,0,0 3 20,25,25 0,75,175
74 """
75 block_sizes, block_starts = bed12.merge_overlapping_blocks(reads)
76
77 #bed12
78 first_read = sorted(reads, key = bed12.get_start)[0]
79 chrom, barcode, fingerprint = key
80 start = bed12.get_start(first_read)
81 end = start + block_starts[-1] + block_sizes[-1]
82 name = "BC:{0};FP:{1}".format(barcode, fingerprint)
83 score = len(reads)
84
85 strand = bed12.get_strand(first_read)
86
87 if strand == '+':
88 thick_start = start
89 thick_end = start + block_sizes[0]
90 else:
91 thick_start = end - block_sizes[-1]
92 thick_end = end
93
94 color = "255,0,0"
95 block_count = len(block_sizes)
96 block_sizes = ','.join(map(str, block_sizes))
97 block_starts = ','.join(map(str, block_starts))
98
99 output = [chrom, start, end, name, score, strand, thick_start, thick_end,
100 color, block_count, block_sizes, block_starts]
101
102 output_str = map(str, output)
103 print '\t'.join(output_str)
104
105
106 def main():
107
108 #PARSER TODO: move this code somewhere else
109 parser = argparse.ArgumentParser()
110 group = parser.add_mutually_exclusive_group(required=True)
111 group.add_argument("-d", "--directory", help="absolute path of the folder containing the bed files")
112 group.add_argument("-f", "--file", help="a bed file")
113 parser.add_argument("-o", help='name of the output file. Only works if the script is called with the -f option, \
114 ignored otherwise.')
115
116 args = parser.parse_args()
117
118 if args.directory:
119 path, folder, files = os.walk(args.directory).next()
120 elif args.file:
121 path = ''
122 files = [args.file]
123 #ENDPARSER
124
125 #create a temporary directory
126 tmp_dir = tempfile.mkdtemp()
127
128 plus_strand_tmp_file = open(os.path.join(tmp_dir, '+'), 'w')
129 minus_strand_tmp_file = open(os.path.join(tmp_dir, '-'), 'w')
130 plus_and_minus_sorted_path = os.path.join(tmp_dir, '+-s')
131
132 #creates two temporary bed files containing either reads on the plus or minus strand
133 for bed_file in files:
134
135 with open(os.path.join(path, bed_file)) as bed_file:
136 reader = csv.reader(bed_file, delimiter='\t')
137
138 for read in reader:
139 strand = bed12.get_strand(read)
140 if strand == '+':
141 plus_strand_tmp_file.write('\t'.join(read) + '\n')
142 elif strand == '-':
143 minus_strand_tmp_file.write('\t'.join(read) + '\n')
144
145
146 #close the files
147 plus_strand_tmp_file.close()
148 minus_strand_tmp_file.close()
149
150 #call unix sort on the file containing reads on the plus strand by tss
151 with open(os.path.join(tmp_dir, '+sorted'), "w") as outfile:
152 subprocess.call(["sort", '-k2,2n', os.path.join(tmp_dir, '+')], stdout=outfile)
153
154 #call unix sort on the file containing reads on the minus strand by tss
155 with open(os.path.join(tmp_dir, '-sorted'), "w") as outfile:
156 subprocess.call(["sort", '-k3,3n', os.path.join(tmp_dir, '-')], stdout=outfile)
157
158 #concatenate the files sorted by tss
159 with open(plus_and_minus_sorted_path, "w") as outfile:
160 subprocess.call(['cat', os.path.join(tmp_dir, '+sorted'), os.path.join(tmp_dir, '-sorted')], stdout=outfile)
161
162 with open(plus_and_minus_sorted_path) as bedfile:
163 reader = csv.reader(bedfile, delimiter='\t')
164 reads = (line for line in reader)
165
166 #for each reads on the same tss
167 for tss, reads in itertools.groupby(reads, bed12.get_tss):
168 d = defaultdict(list)
169
170 #group the reads by chr, barcode and fingerprint
171 for read in reads:
172 key = (bed12.get_chrom(read), get_barcode(read), get_fingerprint(read))
173 d[key].append(read)
174
175 #merge and print the reads that have similar tss, barcode and fingerprint
176 for key, reads in d.iteritems():
177 print_read_to_bed12(key, reads)
178
179 shutil.rmtree(tmp_dir)
180
181
182 if __name__ == '__main__':
183 main()
184
185 #TODO: combine this script with dedup_fingerprint