comparison overlapping_reads.py @ 1:6f1378738798 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 6133bb114c76a795fa12a4a11edb1a8b80fd104d
author artbio
date Tue, 29 Aug 2017 20:02:15 -0400
parents
children 4d9682bd3a6b
comparison
equal deleted inserted replaced
0:a35e6f9c1d34 1:6f1378738798
1 import argparse
2 from collections import defaultdict
3
4 import pysam
5
6
7 def Parser():
8 the_parser = argparse.ArgumentParser()
9 the_parser.add_argument(
10 '--input', action="store", type=str, help="bam alignment file")
11 the_parser.add_argument(
12 '--minquery', type=int,
13 help="Minimum readsize of query reads (nt) - must be an integer")
14 the_parser.add_argument(
15 '--maxquery', type=int,
16 help="Maximum readsize of query reads (nt) - must be an integer")
17 the_parser.add_argument(
18 '--mintarget', type=int,
19 help="Minimum readsize of target reads (nt) - must be an integer")
20 the_parser.add_argument(
21 '--maxtarget', type=int,
22 help="Maximum readsize of target reads (nt) - must be an integer")
23 the_parser.add_argument(
24 '--overlap', type=int,
25 help="Overlap analyzed (nt) - must be an integer")
26 the_parser.add_argument(
27 '--output', action="store", type=str,
28 help="Pairable sequences")
29 args = the_parser.parse_args()
30 return args
31
32
33 class Map:
34
35 def __init__(self, bam_file):
36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
37 self.chromosomes = dict(zip(self.bam_object.references,
38 self.bam_object.lengths))
39 self.map_dict = self.create_map(self.bam_object)
40
41 def create_map(self, bam_object):
42 '''
43 Returns a map_dictionary {(chromosome,read_position,polarity):
44 [read_length, ...]}
45 '''
46 map_dictionary = defaultdict(list)
47 # get empty value for start and end of each chromosome
48 for chrom in self.chromosomes:
49 map_dictionary[(chrom, 1, 'F')] = []
50 map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
51 for chrom in self.chromosomes:
52 for read in bam_object.fetch(chrom):
53 positions = read.positions # a list of covered positions
54 if read.is_reverse:
55 map_dictionary[(chrom, positions[-1]+1,
56 'R')].append(read.query_alignment_length)
57 else:
58 map_dictionary[(chrom, positions[0]+1,
59 'F')].append(read.query_alignment_length)
60 return map_dictionary
61
62 def signature_tables(self, minquery, maxquery, mintarget, maxtarget):
63 query_range = range(minquery, maxquery + 1)
64 target_range = range(mintarget, maxtarget + 1)
65 Query_table = defaultdict(dict)
66 Target_table = defaultdict(dict)
67 for key in self.map_dict:
68 for size in self.map_dict[key]:
69 if size in query_range or size in target_range:
70 if key[2] == 'F':
71 coordinate = key[1]
72 else:
73 coordinate = -key[1]
74 if size in query_range:
75 Query_table[key[0]][coordinate] = Query_table[key[0]].get(
76 coordinate, 0) + 1
77 if size in target_range:
78 Target_table[key[0]][coordinate] = \
79 Target_table[key[0]].get(coordinate, 0) + 1
80 return Query_table, Target_table
81
82 def search_overlaps(self, minquery, maxquery, mintarget, maxtarget,
83 overlap=10):
84 Query_table, Target_table = self.signature_tables(minquery, maxquery,
85 mintarget, maxtarget)
86 overlap_groups = defaultdict(list)
87 for chrom in Query_table:
88 for coord in Query_table[chrom]:
89 if Target_table[chrom].get(-coord - overlap + 1, 0):
90 overlap_groups[chrom].append(coord)
91 return overlap_groups
92
93 def feed_overlaps(self, overlap_groups, minquery, output, overlap=10):
94 F = open(output, 'w')
95 for chrom in sorted(overlap_groups):
96 for pos in sorted(overlap_groups[chrom]):
97 if pos > 0: # read are forward
98 reads = self.bam_object.fetch(chrom, start=pos-1,
99 end=pos-1+overlap-1)
100 for read in reads:
101 positions = read.positions
102 if pos-1 == positions[0] and \
103 read.query_alignment_length >= minquery:
104 F.write('>%s|%s|%s|%s\n%s\n' % (
105 chrom, pos, 'F',
106 read.query_alignment_length,
107 read.query_sequence))
108 else: # reads are reverse
109 reads = self.bam_object.fetch(chrom,
110 start=-pos-1-overlap+1,
111 end=-pos-1)
112 for read in reads:
113 positions = read.positions
114 if -pos-1 == positions[-1] and \
115 read.query_alignment_length >= minquery:
116 readseq = self.revcomp(read.query_sequence)
117 readsize = read.query_alignment_length
118 F.write('>%s|%s|%s|%s\n%s\n' % (chrom,
119 positions[0] + 1,
120 'R', readsize, readseq))
121 F.close()
122 return
123
124 def revcomp(self, sequence):
125 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
126 revseq = sequence[::-1]
127 return "".join([antidict[i] for i in revseq])
128
129
130 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10):
131 mapobj = Map(input)
132 mapobj.feed_overlaps(mapobj.search_overlaps(minquery, maxquery,
133 mintarget, maxtarget,
134 overlap), minquery, output)
135
136
137 if __name__ == "__main__":
138 args = Parser()
139 main(args.input, args.minquery, args.maxquery, args.mintarget,
140 args.maxtarget, args.output)