comparison small_rna_clusters.py @ 0:8028521b6e4f draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
author artbio
date Mon, 07 Oct 2019 12:51:25 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8028521b6e4f
1 import argparse
2 from collections import defaultdict
3
4 import pysam
5
6
def Parser():
    """Build the command-line interface and return the parsed arguments.

    Returns:
        argparse.Namespace with attributes: inputs, minsize, maxsize,
        cluster, sample_names, bed, bed_skipsize, bed_skipdensity,
        bed_skipcounts, outputs, nostrand.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--inputs', dest='inputs', required=True,
                            nargs='+', help='list of input BAM files')
    the_parser.add_argument('--minsize', dest='minsize', type=int,
                            default=19, help='minimal size of reads')
    the_parser.add_argument('--maxsize', dest='maxsize', type=int,
                            default=29, help='maximal size of reads')
    the_parser.add_argument('--cluster', dest='cluster', type=int,
                            default=0, help='clustering distance')
    the_parser.add_argument('--sample_names', dest='sample_names',
                            required=True, nargs='+',
                            help='list of sample names')
    the_parser.add_argument('--bed', dest='bed', required=False,
                            help='Name of bed output must be specified\
                            if --cluster option used')
    # Help text fixed: the real default is 1 and the filter skips clusters
    # whose size is *strictly* less than the threshold.
    the_parser.add_argument('--bed_skipsize', dest='bed_skipsize',
                            required=False, type=int, default=1,
                            help='Skip clusters whose size is less than the\
                            specified integer in the bed output. \
                            Default = 1 (clusters of size >= 1 are kept)')
    the_parser.add_argument('--bed_skipdensity', dest='bed_skipdensity',
                            required=False, type=float, default=0,
                            help='Skip clusters of density equal or less than\
                            specified float number in the bed output. \
                            Default = 0, not skipping')
    # Help text fixed: the real default is 1 and the filter skips clusters
    # whose read count is *strictly* less than the threshold.
    the_parser.add_argument('--bed_skipcounts', dest='bed_skipcounts',
                            required=False, type=int, default=1,
                            help='Skip clusters whose read count is less than\
                            the specified integer in the bed output. \
                            Default = 1 (clusters with >= 1 read are kept)')
    # Help text fixed: this is a single output path (action='store'),
    # not a list of two paths.
    the_parser.add_argument('--outputs', action='store',
                            help='path of the tabular output file')
    the_parser.add_argument('--nostrand', action='store_true',
                            help='Consider reads regardless their polarity')

    args = the_parser.parse_args()
    return args
45
46
class Map:
    """Per-sample map of small-RNA read alignments from one BAM file.

    Builds a dictionary keyed by (chromosome, 1-based position, polarity)
    whose values are lists of aligned read lengths; when a clustering
    distance is given, positions are merged into clusters via tile_map.
    """

    def __init__(self, bam_file, sample, minsize, maxsize, cluster, nostrand):
        # NOTE(review): minsize and maxsize are stored but never used
        # anywhere in this class -- size filtering appears to be missing
        # from create_map; confirm intended behavior.
        self.sample_name = sample
        self.minsize = minsize
        self.maxsize = maxsize
        self.cluster = cluster  # clustering distance; 0 disables tile_map
        if not nostrand:
            self.nostrand = False
        else:
            self.nostrand = True
        self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
        # {reference_name: reference_length}, from the BAM header
        self.chromosomes = dict(zip(self.bam_object.references,
                                    self.bam_object.lengths))
        self.map_dict = self.create_map(self.bam_object, self.nostrand)
        if self.cluster:
            self.map_dict = self.tile_map(self.map_dict, self.cluster)

    def create_map(self, bam_object, nostrand=False):
        '''
        Returns a map_dictionary {(chromosome,read_position,polarity):
        [read_length, ...]}

        Positions are 1-based. Forward reads are keyed by their leftmost
        aligned base, reverse reads by their rightmost aligned base; with
        nostrand=True every read is recorded on the 'F' key at its
        leftmost base.
        '''
        map_dictionary = defaultdict(list)
        for chrom in self.chromosomes:
            # get empty value for start and end of each chromosome
            # (guarantees both chromosome extremities appear in the output)
            map_dictionary[(chrom, 1, 'F')] = []
            map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
            if not nostrand:
                for read in bam_object.fetch(chrom):
                    # NOTE(review): read.positions is a deprecated pysam
                    # alias of get_reference_positions() -- confirm the
                    # installed pysam still provides it.
                    positions = read.positions  # a list of covered positions
                    if read.is_reverse:
                        map_dictionary[(chrom, positions[-1]+1, 'R')].append(
                            read.query_alignment_length)
                    else:
                        map_dictionary[(chrom, positions[0]+1, 'F')].append(
                            read.query_alignment_length)
            else:
                for read in bam_object.fetch(chrom):
                    positions = read.positions  # a list of covered positions
                    # polarity is ignored: everything is keyed as 'F'
                    map_dictionary[(chrom, positions[0]+1, 'F')].append(
                        read.query_alignment_length)
        return map_dictionary

    def grouper(self, iterable, clust_distance):
        """Yield successive lists of coordinates whose consecutive gaps
        are <= clust_distance.

        Expects `iterable` sorted in ascending order. `not prev` treats a
        previous value of 0 as "no previous"; coordinates here are 1-based
        so that case cannot occur.
        """
        prev = None
        group = []
        for item in iterable:
            if not prev or item - prev <= clust_distance:
                group.append(item)
            else:
                yield group
                group = [item]
            prev = item
        if group:
            # flush the last open group
            yield group

    def tile_map(self, map_dic, clust_distance):
        '''
        takes a map_dictionary {(chromosome,read_position,polarity):
        [read_length, ...]}
        and returns a map_dictionary with structure:
        {(chromosome,read_position,polarity):
        [*counts*, [start_clust, end_clust]]}

        Each cluster is keyed by the midpoint of its first and last
        coordinates; under Python 3 the `/2` true division makes that
        key a float.
        '''
        clustered_dic = defaultdict(list)
        for chrom in self.chromosomes:
            F_chrom_coord = []
            R_chrom_coord = []
            # collect all recorded positions of this chromosome, per strand
            for key in map_dic:
                if key[0] == chrom and key[2] == 'F':
                    F_chrom_coord.append(key[1])
                elif key[0] == chrom and key[2] == 'R':
                    R_chrom_coord.append(key[1])
            F_chrom_coord = list(set(F_chrom_coord))
            R_chrom_coord = list(set(R_chrom_coord))
            F_chrom_coord.sort()
            R_chrom_coord.sort()
            F_clust_values = [i for i in self.grouper(F_chrom_coord,
                                                      clust_distance)]
            F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values]
            R_clust_values = [i for i in self.grouper(R_chrom_coord,
                                                      clust_distance)]
            R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values]
            # now 2 dictionnaries (F and R) with structure:
            # {centered_coordinate: [coord1, coord2, coord3, ..]}
            F_clust_dic = dict(zip(F_clust_keys, F_clust_values))
            R_clust_dic = dict(zip(R_clust_keys, R_clust_values))
            for centcoor in F_clust_dic:
                accumulator = []
                for coor in F_clust_dic[centcoor]:
                    accumulator.extend(map_dic[(chrom, coor, 'F')])
                '''
                compute the offset of the cluster due to
                size of reads
                '''
                # longest read starting at the cluster's last position
                # extends the cluster end; empty list (chromosome-boundary
                # placeholder) gives margin 0
                last = sorted(F_clust_dic[centcoor])[-1]
                try:
                    margin = max(map_dic[(chrom, last, 'F')]) - 1
                except ValueError:
                    margin = 0
                clustered_dic[(chrom, centcoor, 'F')] = [len(accumulator), [
                    F_clust_dic[centcoor][0],
                    F_clust_dic[centcoor][-1] + margin]]
            for centcoor in R_clust_dic:
                accumulator = []
                for coor in R_clust_dic[centcoor]:
                    accumulator.extend(map_dic[(chrom, coor, 'R')])
                '''
                compute the offset of the cluster due to
                size of reads
                '''
                # reverse reads are keyed by their rightmost base, so the
                # longest read at the cluster's first position extends the
                # cluster start leftwards
                first = sorted(R_clust_dic[centcoor])[0]
                try:
                    margin = max(map_dic[(chrom, first, 'R')]) - 1
                except ValueError:
                    margin = 0
                clustered_dic[(chrom, centcoor, 'R')] = [len(accumulator), [
                    R_clust_dic[centcoor][0] - margin,
                    R_clust_dic[centcoor][-1]]]
        return clustered_dic

    def write_table(self, mapdict, out):
        '''
        Writer of a tabular file
        Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
        <some mapped value>
        out is an *open* file handler

        NOTE(review): this method is not called anywhere in this file --
        confirm whether it is dead code or used by an external caller.
        '''
        for key in sorted(mapdict):
            line = [self.sample_name, key[0], self.chromosomes[key[0]],
                    key[1], key[2], mapdict[key]]
            line = [str(i) for i in line]
            out.write('\t'.join(line) + '\n')

    def write_cluster_table(self, clustered_dic, out, bedpath):
        '''
        Writer of a tabular file
        Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
        <some mapped value>
        out is an *open* file handler
        bed is an a file handler internal to the function

        Expects clustered_dic values shaped [count, [start, end]] as
        produced by tile_map -- calling it on an unclustered map_dict
        would fail on the [1][0] indexing below.
        '''
        def filterCluster(size, count, density):
            # NOTE(review): reads the module-level `args` namespace rather
            # than method parameters -- this couples the class to the CLI
            # entry point; confirm before reusing Map as a library.
            if size < args.bed_skipsize:
                return False
            if count < args.bed_skipcounts:
                return False
            if density <= args.bed_skipdensity:
                return False
            return True
        bed = open(bedpath, 'w')
        clusterid = 0
        for key in sorted(clustered_dic):
            start = clustered_dic[key][1][0]
            end = clustered_dic[key][1][1]
            size = end - start + 1
            read_count = clustered_dic[key][0]
            if self.nostrand:
                polarity = '.'
            elif key[2] == 'F':
                polarity = '+'
            else:
                polarity = '-'
            density = float(read_count) / size
            line = [self.sample_name, key[0], self.chromosomes[key[0]],
                    key[1], key[2], read_count,
                    str(start) + "-" + str(end), str(size), str(density)]
            line = [str(i) for i in line]
            out.write('\t'.join(line) + '\n')
            if filterCluster(size, read_count, density):
                clusterid += 1
                name = 'cluster_' + str(clusterid)
                # BED is 0-based half-open, hence start-1
                bedline = [key[0], str(start-1), str(end), name,
                           str(read_count), polarity, str(density)]
                bed.write('\t'.join(bedline) + '\n')
        print("number of reported clusters:", clusterid)
        bed.close()
225
226
def main(inputs, samples, outputs, minsize, maxsize, cluster,
         nostrand, bedfile=None, bed_skipsize=0):
    """Build a Map per (BAM, sample) pair and write the clustered table.

    Args:
        inputs: list of BAM file paths.
        samples: list of sample names, parallel to `inputs`.
        outputs: path of the tabular output file.
        minsize, maxsize: read-size bounds forwarded to Map.
        cluster: clustering distance forwarded to Map.
        nostrand: ignore read polarity when True.
        bedfile: path of the bed output written by write_cluster_table.
        bed_skipsize: accepted for interface compatibility.
            NOTE(review): never used here -- the bed filter reads the
            module-level `args` instead; confirm intended.
    """
    # `with` guarantees the table file is closed even if a Map raises
    # (the original leaked the handle on error).
    with open(outputs, 'w') as out:
        header = ["# Dataset", "Chromosome", "Chrom_length", "Coordinate",
                  "Polarity", "Counts", "Start-End", "Cluster Size",
                  "density"]
        out.write('\t'.join(header) + '\n')
        # `bam_path` avoids shadowing the `input` builtin
        for bam_path, sample in zip(inputs, samples):
            mapobj = Map(bam_path, sample, minsize, maxsize, cluster,
                         nostrand)
            mapobj.write_cluster_table(mapobj.map_dict, out, bedfile)
237
238
239 if __name__ == "__main__":
240 args = Parser()
241 # if identical sample names
242 if len(set(args.sample_names)) != len(args.sample_names):
243 args.sample_names = [name + '_' + str(i) for
244 i, name in enumerate(args.sample_names)]
245 main(args.inputs, args.sample_names, args.outputs,
246 args.minsize, args.maxsize, args.cluster, args.nostrand, args.bed)