comparison small_rna_maps.py @ 0:6d48150495e3 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit d4d8106d66b65679a1a685ab94bfcf99cdb7b959
author artbio
date Mon, 24 Jul 2017 06:28:45 -0400
parents
children 40972a8dfab9
comparison
equal deleted inserted replaced
-1:000000000000 0:6d48150495e3
1 import argparse
2 from collections import defaultdict
3
4 import numpy
5
6 import pysam
7
8
9 def Parser():
10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument('--input', dest='input', required=True,
12 nargs='+', help='input BAM files')
13 the_parser.add_argument('--sample_name', dest='sample_name',
14 required=True, nargs='+', help='sample name')
15 the_parser.add_argument('--output', action='store',
16 type=str, help='output tabular file')
17 the_parser.add_argument('-S', '--sizes', action='store',
18 help='use to output read sizes dataframe')
19 args = the_parser.parse_args()
20 return args
21
22
23 class Map:
24
25 def __init__(self, bam_file, sample, computeSize=False):
26 self.sample_name = sample
27 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
28 self.chromosomes = dict(zip(self.bam_object.references,
29 self.bam_object.lengths))
30 self.map_dict = self.create_map(self.bam_object)
31 self.max = self.compute_max(self.map_dict)
32 self.mean = self.compute_mean(self.map_dict)
33 self.median = self.compute_median(self.map_dict)
34 self.coverage = self.compute_coverage(self.map_dict)
35 if computeSize:
36 self.size = self.compute_size(self.map_dict)
37
38 def create_map(self, bam_object):
39 '''
40 Returns a map_dictionary {(chromosome,read_position,polarity):
41 [read_length, ...]}
42 '''
43 map_dictionary = defaultdict(list)
44 # get empty value for start and end of each chromosome
45 for chrom in self.chromosomes:
46 map_dictionary[(chrom, 1, 'F')] = []
47 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
48 for chrom in self.chromosomes:
49 for read in bam_object.fetch(chrom):
50 positions = read.positions # a list of covered positions
51 for pos in positions:
52 if not map_dictionary[(chrom, pos+1, 'F')]:
53 map_dictionary[(chrom, pos+1, 'F')] = []
54 if not map_dictionary[(chrom, pos+1, 'R')]:
55 map_dictionary[(chrom, pos+1, 'R')] = []
56 if read.is_reverse:
57 map_dictionary[(chrom, positions[-1]+1,
58 'R')].append(read.query_alignment_length)
59 else:
60 map_dictionary[(chrom, positions[0]+1,
61 'F')].append(read.query_alignment_length)
62 return map_dictionary
63
64 def compute_max(self, map_dictionary):
65 '''
66 takes a map_dictionary as input and returns
67 a max_dictionary {(chromosome,read_position,polarity):
68 max_of_number_of_read_at_any_position}
69 '''
70 merge_keylist = [(i[0], 0) for i in map_dictionary.keys()]
71 max_dictionary = dict(merge_keylist)
72 for key in map_dictionary:
73 if len(map_dictionary[key]) > max_dictionary[key[0]]:
74 max_dictionary[key[0]] = len(map_dictionary[key])
75 return max_dictionary
76
77 def compute_mean(self, map_dictionary):
78 '''
79 takes a map_dictionary as input and returns
80 a mean_dictionary {(chromosome,read_position,polarity):
81 mean_value_of_reads}
82 '''
83 mean_dictionary = dict()
84 for key in map_dictionary:
85 if len(map_dictionary[key]) == 0:
86 mean_dictionary[key] = 0
87 else:
88 mean_dictionary[key] = round(numpy.mean(map_dictionary[key]),
89 1)
90 return mean_dictionary
91
92 def compute_median(self, map_dictionary):
93 '''
94 takes a map_dictionary as input and returns
95 a mean_dictionary {(chromosome,read_position,polarity):
96 mean_value_of_reads}
97 '''
98 median_dictionary = dict()
99 for key in map_dictionary:
100 if len(map_dictionary[key]) == 0:
101 median_dictionary[key] = 0
102 else:
103 median_dictionary[key] = numpy.median(map_dictionary[key])
104 return median_dictionary
105
106 def compute_coverage(self, map_dictionary, quality=10):
107 '''
108 takes a map_dictionary as input and returns
109 a coverage_dictionary {(chromosome,read_position,polarity):
110 coverage}
111 '''
112 coverage_dictionary = dict()
113 for chrom in self.chromosomes:
114 coverage_dictionary[(chrom, 1, 'F')] = 0
115 coverage_dictionary[(chrom, self.chromosomes[chrom], 'F')] = 0
116
117 for key in map_dictionary:
118 coverage = self.bam_object.count_coverage(
119 reference=key[0],
120 start=key[1]-1,
121 end=key[1],
122 quality_threshold=quality)
123 """ Add the 4 coverage values """
124 coverage = [sum(x) for x in zip(*coverage)]
125 coverage_dictionary[key] = coverage[0]
126 # coverage_dictionary[(key[0], key[1], 'R')] = coverage
127 return coverage_dictionary
128
129 def compute_size(self, map_dictionary):
130 '''
131 Takes a map_dictionary and returns a dictionary of sizes:
132 {chrom: {polarity: {size: nbre of reads}}}
133 '''
134 size_dictionary = defaultdict(lambda: defaultdict(
135 lambda: defaultdict( int )))
136 # to track empty chromosomes
137 for chrom in self.chromosomes:
138 if self.bam_object.count(chrom) == 0:
139 size_dictionary[chrom]['F'][10] = 0
140 for key in map_dictionary:
141 for size in map_dictionary[key]:
142 size_dictionary[key[0]][key[2]][size] += 1
143 return size_dictionary
144
145 def write_size_table(self, out):
146 '''
147 Dataset, Chromosome, Polarity, Size, Nbr_reads
148 out is an *open* file handler
149 '''
150 for chrom in sorted(self.size):
151 sizes = self.size[chrom]['F'].keys()
152 sizes.extend(self.size[chrom]['R'].keys())
153 for polarity in sorted(self.size[chrom]):
154 for size in range(min(sizes), max(sizes)+1):
155 try:
156 line = [self.sample_name, chrom, polarity, size,
157 self.size[chrom][polarity][size]]
158 except KeyError:
159 line = [self.sample_name, chrom, polarity, size, 0]
160 line = [str(i) for i in line]
161 out.write('\t'.join(line) + '\n')
162
163 def write_table(self, out):
164 '''
165 Dataset, Chromosome, Chrom_length, Coordinate, Nbr_reads
166 Polarity, Max, Mean, Median, Coverage
167 out is an *open* file handler
168 '''
169 for key in sorted(self.map_dict):
170 line = [self.sample_name, key[0], self.chromosomes[key[0]],
171 key[1], len(self.map_dict[key]), key[2], self.max[key[0]],
172 self.mean[key], self.median[key], self.coverage[key]]
173 line = [str(i) for i in line]
174 out.write('\t'.join(line) + '\n')
175
176
177 def main(inputs, samples, file_out, size_file_out=''):
178 F = open(file_out, 'w')
179 header = ["Dataset", "Chromosome", "Chrom_length", "Coordinate",
180 "Nbr_reads", "Polarity", "Max", "Mean", "Median", "Coverage"]
181 F.write('\t'.join(header) + '\n')
182 if size_file_out:
183 Fs = open(size_file_out, 'w')
184 header = ["Dataset", "Chromosome", "Polarity", "Size", "Nbr_reads"]
185 Fs.write('\t'.join(header) + '\n')
186 for file, sample in zip(inputs, samples):
187 mapobj = Map(file, sample, computeSize=True)
188 mapobj.write_table(F)
189 mapobj.write_size_table(Fs)
190 Fs.close()
191 else:
192 for file, sample in zip(inputs, samples):
193 mapobj = Map(file, sample, computeSize=False)
194 mapobj.write_table(F)
195 F.close()
196
197
198 if __name__ == "__main__":
199 args = Parser()
200 # if identical sample names
201 if len(set(args.sample_name)) != len(args.sample_name):
202 args.sample_name = [name + '_' + str(i) for
203 i, name in enumerate(args.sample_name)]
204 main(args.input, args.sample_name, args.output, args.sizes)