comparison signature.py @ 7:07771982ef9b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 7276b6b73aef7af4058ad2c1e34c4557e9cccbe0
author artbio
date Sun, 10 Sep 2017 13:50:40 -0400
parents a35e6f9c1d34
children 8d3ca9652a5b
comparison
equal deleted inserted replaced
6:4da23f009c9e 7:07771982ef9b
38 return args 38 return args
39 39
40 40
41 class Map: 41 class Map:
42 42
43 def __init__(self, bam_file): 43 def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23,
44 maxtarget=29, minscope=1, maxscope=19, output_h='',
45 output_z=''):
44 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') 46 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
47 self.query_range = range(minquery, maxquery + 1)
48 self.target_range = range(mintarget, maxtarget + 1)
49 self.scope = range(minscope, maxscope + 1)
50 self.H = open(output_h, 'w')
51 self.Z = open(output_z, 'w')
45 self.chromosomes = dict(zip(self.bam_object.references, 52 self.chromosomes = dict(zip(self.bam_object.references,
46 self.bam_object.lengths)) 53 self.bam_object.lengths))
47 self.map_dict = self.create_map(self.bam_object) 54 self.map_dict = self.create_map(self.bam_object)
55 self.query_positions = self.compute_query_positions()
56 self.Z.write(self.compute_signature_pairs())
57 self.H.write(self.compute_signature_h())
58 self.H.close()
59 self.Z.close()
48 60
49 def create_map(self, bam_object): 61 def create_map(self, bam_object):
50 ''' 62 '''
51 Returns a map_dictionary {(chromosome,read_position,polarity): 63 Returns a map_dictionary {(chromosome,read_position,polarity):
52 [read_length, ...]} 64 [read_length, ...]}
56 for chrom in self.chromosomes: 68 for chrom in self.chromosomes:
57 map_dictionary[(chrom, 1, 'F')] = [] 69 map_dictionary[(chrom, 1, 'F')] = []
58 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] 70 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
59 for chrom in self.chromosomes: 71 for chrom in self.chromosomes:
60 for read in bam_object.fetch(chrom): 72 for read in bam_object.fetch(chrom):
61 positions = read.positions # a list of covered positions
62 if read.is_reverse: 73 if read.is_reverse:
63 map_dictionary[(chrom, positions[-1]+1, 74 map_dictionary[(chrom, read.reference_end,
64 'R')].append(read.query_alignment_length) 75 'R')].append(read.query_alignment_length)
65 else: 76 else:
66 map_dictionary[(chrom, positions[0]+1, 77 map_dictionary[(chrom, read.reference_start+1,
67 'F')].append(read.query_alignment_length) 78 'F')].append(read.query_alignment_length)
68 return map_dictionary 79 return map_dictionary
69 80
70 def signature_tables(self, minquery, maxquery, mintarget, maxtarget): 81 def compute_query_positions(self):
71 query_range = range(minquery, maxquery + 1) 82 ''' this method does not filter on read size, just forward reads
72 target_range = range(mintarget, maxtarget + 1) 83 that overlap reverse reads in the overlap range'''
84 all_query_positions = defaultdict(list)
85 for genomicKey in self.map_dict.keys():
86 chrom, coord, pol = genomicKey
87 for i in self.scope:
88 if pol == 'F' and len(self.map_dict[chrom,
89 coord+i-1,
90 'R']) > 0:
91 all_query_positions[chrom].append(coord)
92 break
93 for chrom in all_query_positions:
94 all_query_positions[chrom] = sorted(
95 list(set(all_query_positions[chrom])))
96 return all_query_positions
97
98 def countpairs(self, uppers, lowers):
99 query_range = self.query_range
100 target_range = self.target_range
101 uppers = [size for size in uppers if size in query_range or size in
102 target_range]
103 lowers = [size for size in lowers if size in query_range or size in
104 target_range]
105 paired = []
106 for upread in uppers:
107 for downread in lowers:
108 if (upread in query_range and downread in target_range) or (
109 upread in target_range and downread in query_range):
110 paired.append(upread)
111 lowers.remove(downread)
112 break
113 return len(paired)
114
115 def compute_signature_pairs(self):
116 frequency_table = defaultdict(dict)
117 scope = self.scope
118 for chrom in self.chromosomes:
119 for overlap in scope:
120 frequency_table[chrom][overlap] = 0
121 for chrom in self.query_positions:
122 for coord in self.query_positions[chrom]:
123 for overlap in scope:
124 uppers = self.map_dict[chrom, coord, 'F']
125 lowers = self.map_dict[chrom, coord+overlap-1, 'R']
126 frequency_table[chrom][overlap] += self.countpairs(uppers,
127 lowers)
128 # compute overlaps for all chromosomes merged
129 for overlap in scope:
130 accumulator = []
131 for chrom in frequency_table:
132 if chrom != 'all_chromosomes':
133 accumulator.append(frequency_table[chrom][overlap])
134 frequency_table['all_chromosomes'][overlap] = sum(accumulator)
135 return self.stringify_table(frequency_table)
136
137 def signature_tables(self):
138 query_range = self.query_range
139 target_range = self.target_range
73 Query_table = defaultdict(dict) 140 Query_table = defaultdict(dict)
74 Target_table = defaultdict(dict) 141 Target_table = defaultdict(dict)
75 for key in self.map_dict: 142 for key in self.map_dict:
76 for size in self.map_dict[key]: 143 for size in self.map_dict[key]:
77 if size in query_range or size in target_range: 144 if size in query_range or size in target_range:
85 if size in target_range: 152 if size in target_range:
86 Target_table[key[0]][coordinate] = \ 153 Target_table[key[0]][coordinate] = \
87 Target_table[key[0]].get(coordinate, 0) + 1 154 Target_table[key[0]].get(coordinate, 0) + 1
88 return Query_table, Target_table 155 return Query_table, Target_table
89 156
90 def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget, 157 def compute_signature_h(self):
91 scope, zscore="no"): 158 scope = self.scope
92 Query_table, Target_table = self.signature_tables(minquery, maxquery, 159 Query_table, Target_table = self.signature_tables()
93 mintarget, maxtarget)
94 frequency_table = defaultdict(dict)
95 for chrom in self.chromosomes:
96 for overlap in scope:
97 frequency_table[chrom][overlap] = 0
98 for chrom in Query_table:
99 for coord in Query_table[chrom]:
100 for overlap in scope:
101 frequency_table[chrom][overlap] += min(
102 Query_table[chrom][coord],
103 Target_table[chrom].get(-coord - overlap + 1, 0))
104 # since we want the number of pairs, not the number or paired reads
105 # to do: what in complex cases
106 # with query and target sizes partially overlap ?
107 for chrom in frequency_table:
108 for overlap in frequency_table[chrom]:
109 frequency_table[chrom][overlap] /= 2
110 # compute overlaps for all chromosomes merged
111 for overlap in scope:
112 accumulator = []
113 for chrom in frequency_table:
114 if chrom != 'all_chromosomes':
115 accumulator.append(frequency_table[chrom][overlap])
116 frequency_table['all_chromosomes'][overlap] = sum(accumulator)
117 return self.stringify_table(frequency_table)
118
119 def compute_signature_h(self, minquery, maxquery, mintarget,
120 maxtarget, scope):
121 Query_table, Target_table = self.signature_tables(minquery, maxquery,
122 mintarget, maxtarget)
123 frequency_table = defaultdict(dict) 160 frequency_table = defaultdict(dict)
124 for chrom in self.chromosomes: 161 for chrom in self.chromosomes:
125 for overlap in scope: 162 for overlap in scope:
126 frequency_table[chrom][overlap] = 0 163 frequency_table[chrom][overlap] = 0
127 for chrom in Query_table: 164 for chrom in Query_table:
185 str(frequency_table[chrom][overlap]), 222 str(frequency_table[chrom][overlap]),
186 str((frequency_table[chrom][overlap] - z_mean)/z_std))) 223 str((frequency_table[chrom][overlap] - z_mean)/z_std)))
187 return ''.join(tablestring) 224 return ''.join(tablestring)
188 225
189 226
190
191 def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope,
192 output_h, output_z, genome_wide=False, zscore="no"):
193 H = open(output_h, 'w')
194 Z = open(output_z, 'w')
195 mapobj = Map(input)
196 scope = range(minscope, maxscope + 1)
197 Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget,
198 maxtarget, scope, zscore="no"))
199 H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget,
200 maxtarget, scope))
201 H.close()
202 Z.close()
203
204
205 if __name__ == "__main__": 227 if __name__ == "__main__":
206 args = Parser() 228 args = Parser()
207 main(args.input, args.minquery, args.maxquery, args.mintarget, 229 mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget,
208 args.maxtarget, args.minscope, args.maxscope, args.output_h, 230 args.maxtarget, args.minscope, args.maxscope, args.output_h,
209 args.output_z) 231 args.output_z)