comparison signature.py @ 0:a35e6f9c1d34 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 6719543c5017d581ae012b864d7c9088f0767fc8
author artbio
date Mon, 28 Aug 2017 09:29:47 -0400
parents
children 07771982ef9b
comparison
equal deleted inserted replaced
-1:000000000000 0:a35e6f9c1d34
1 import argparse
2 from collections import defaultdict
3
4 import numpy
5
6 import pysam
7
8
9 def Parser():
10 the_parser = argparse.ArgumentParser()
11 the_parser.add_argument(
12 '--input', action="store", type=str, help="bam alignment file")
13 the_parser.add_argument(
14 '--minquery', type=int,
15 help="Minimum readsize of query reads (nt) - must be an integer")
16 the_parser.add_argument(
17 '--maxquery', type=int,
18 help="Maximum readsize of query reads (nt) - must be an integer")
19 the_parser.add_argument(
20 '--mintarget', type=int,
21 help="Minimum readsize of target reads (nt) - must be an integer")
22 the_parser.add_argument(
23 '--maxtarget', type=int,
24 help="Maximum readsize of target reads (nt) - must be an integer")
25 the_parser.add_argument(
26 '--minscope', type=int,
27 help="Minimum overlap analyzed (nt) - must be an integer")
28 the_parser.add_argument(
29 '--maxscope', type=int,
30 help="Maximum overlap analyzed (nt) - must be an integer")
31 the_parser.add_argument(
32 '--output_h', action="store", type=str,
33 help="h-signature dataframe")
34 the_parser.add_argument(
35 '--output_z', action="store", type=str,
36 help="z-signature dataframe")
37 args = the_parser.parse_args()
38 return args
39
40
41 class Map:
42
43 def __init__(self, bam_file):
44 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
45 self.chromosomes = dict(zip(self.bam_object.references,
46 self.bam_object.lengths))
47 self.map_dict = self.create_map(self.bam_object)
48
49 def create_map(self, bam_object):
50 '''
51 Returns a map_dictionary {(chromosome,read_position,polarity):
52 [read_length, ...]}
53 '''
54 map_dictionary = defaultdict(list)
55 # get empty value for start and end of each chromosome
56 for chrom in self.chromosomes:
57 map_dictionary[(chrom, 1, 'F')] = []
58 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
59 for chrom in self.chromosomes:
60 for read in bam_object.fetch(chrom):
61 positions = read.positions # a list of covered positions
62 if read.is_reverse:
63 map_dictionary[(chrom, positions[-1]+1,
64 'R')].append(read.query_alignment_length)
65 else:
66 map_dictionary[(chrom, positions[0]+1,
67 'F')].append(read.query_alignment_length)
68 return map_dictionary
69
70 def signature_tables(self, minquery, maxquery, mintarget, maxtarget):
71 query_range = range(minquery, maxquery + 1)
72 target_range = range(mintarget, maxtarget + 1)
73 Query_table = defaultdict(dict)
74 Target_table = defaultdict(dict)
75 for key in self.map_dict:
76 for size in self.map_dict[key]:
77 if size in query_range or size in target_range:
78 if key[2] == 'F':
79 coordinate = key[1]
80 else:
81 coordinate = -key[1]
82 if size in query_range:
83 Query_table[key[0]][coordinate] = Query_table[key[0]].get(
84 coordinate, 0) + 1
85 if size in target_range:
86 Target_table[key[0]][coordinate] = \
87 Target_table[key[0]].get(coordinate, 0) + 1
88 return Query_table, Target_table
89
90 def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget,
91 scope, zscore="no"):
92 Query_table, Target_table = self.signature_tables(minquery, maxquery,
93 mintarget, maxtarget)
94 frequency_table = defaultdict(dict)
95 for chrom in self.chromosomes:
96 for overlap in scope:
97 frequency_table[chrom][overlap] = 0
98 for chrom in Query_table:
99 for coord in Query_table[chrom]:
100 for overlap in scope:
101 frequency_table[chrom][overlap] += min(
102 Query_table[chrom][coord],
103 Target_table[chrom].get(-coord - overlap + 1, 0))
104 # since we want the number of pairs, not the number or paired reads
105 # to do: what in complex cases
106 # with query and target sizes partially overlap ?
107 for chrom in frequency_table:
108 for overlap in frequency_table[chrom]:
109 frequency_table[chrom][overlap] /= 2
110 # compute overlaps for all chromosomes merged
111 for overlap in scope:
112 accumulator = []
113 for chrom in frequency_table:
114 if chrom != 'all_chromosomes':
115 accumulator.append(frequency_table[chrom][overlap])
116 frequency_table['all_chromosomes'][overlap] = sum(accumulator)
117 return self.stringify_table(frequency_table)
118
119 def compute_signature_h(self, minquery, maxquery, mintarget,
120 maxtarget, scope):
121 Query_table, Target_table = self.signature_tables(minquery, maxquery,
122 mintarget, maxtarget)
123 frequency_table = defaultdict(dict)
124 for chrom in self.chromosomes:
125 for overlap in scope:
126 frequency_table[chrom][overlap] = 0
127 for chrom in Query_table:
128 Total_Query_Numb = 0
129 for coord in Query_table[chrom]:
130 Total_Query_Numb += Query_table[chrom][coord]
131 for coord in Query_table[chrom]:
132 local_table = dict([(overlap, 0) for overlap in scope])
133 number_of_targets = 0
134 for overlap in scope:
135 local_table[overlap] += Query_table[chrom][coord] * \
136 Target_table[chrom].get(-coord - overlap + 1, 0)
137 number_of_targets += Target_table[chrom].get(
138 -coord - overlap + 1, 0)
139 for overlap in scope:
140 try:
141 frequency_table[chrom][overlap] += \
142 local_table[overlap] / number_of_targets \
143 / float(Total_Query_Numb)
144 except ZeroDivisionError:
145 continue
146 # compute overlap probabilities for all chromosomes merged
147 general_frequency_table = dict([(overlap, 0) for overlap in scope])
148 total_aligned_reads = 0
149 for chrom in frequency_table:
150 for overlap in frequency_table[chrom]:
151 total_aligned_reads += self.bam_object.count(chrom)
152 for chrom in frequency_table:
153 for overlap in frequency_table[chrom]:
154 try:
155 general_frequency_table[overlap] += \
156 frequency_table[chrom][overlap] / total_aligned_reads \
157 * self.bam_object.count(chrom)
158 except ZeroDivisionError:
159 continue
160 for overlap in general_frequency_table:
161 frequency_table['all_chromosomes'][overlap] = \
162 general_frequency_table[overlap]
163 return self.stringify_table(frequency_table)
164
165 def stringify_table(self, frequency_table):
166 '''
167 method both to compute z-score and to return a writable string
168 '''
169 tablestring = []
170 for chrom in sorted(frequency_table):
171 accumulator = []
172 for overlap in frequency_table[chrom]:
173 accumulator.append(frequency_table[chrom][overlap])
174 z_mean = numpy.mean(accumulator)
175 z_std = numpy.std(accumulator)
176 if z_std == 0:
177 for overlap in sorted(frequency_table[chrom]):
178 tablestring.append('%s\t%s\t%s\t%s\n' % (
179 chrom, str(overlap),
180 str(frequency_table[chrom][overlap]), str(0)))
181 else:
182 for overlap in sorted(frequency_table[chrom]):
183 tablestring.append('%s\t%s\t%s\t%s\n' % (
184 chrom, str(overlap),
185 str(frequency_table[chrom][overlap]),
186 str((frequency_table[chrom][overlap] - z_mean)/z_std)))
187 return ''.join(tablestring)
188
189
190
191 def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope,
192 output_h, output_z, genome_wide=False, zscore="no"):
193 H = open(output_h, 'w')
194 Z = open(output_z, 'w')
195 mapobj = Map(input)
196 scope = range(minscope, maxscope + 1)
197 Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget,
198 maxtarget, scope, zscore="no"))
199 H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget,
200 maxtarget, scope))
201 H.close()
202 Z.close()
203
204
205 if __name__ == "__main__":
206 args = Parser()
207 main(args.input, args.minquery, args.maxquery, args.mintarget,
208 args.maxtarget, args.minscope, args.maxscope, args.output_h,
209 args.output_z)