comparison overlapping_reads.py @ 5:a7fd04208764 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 24d44a9b7ec9db4dce3d839b597eea2b1be34adb
author artbio
date Sat, 09 Sep 2017 11:57:39 -0400
parents 20d28cfdeefe
children 4da23f009c9e
comparison
equal deleted inserted replaced
4:20d28cfdeefe 5:a7fd04208764
30 return args 30 return args
31 31
32 32
33 class Map: 33 class Map:
34 34
35 def __init__(self, bam_file): 35 def __init__(self, bam_file, output, minquery=23, maxquery=29,
36 mintarget=23, maxtarget=29, overlap=10):
36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') 37 self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
38 self.output = output
39 self.query_range = range(minquery, maxquery + 1)
40 self.target_range = range(mintarget, maxtarget + 1)
41 self.overlap = overlap
37 self.chromosomes = dict(zip(self.bam_object.references, 42 self.chromosomes = dict(zip(self.bam_object.references,
38 self.bam_object.lengths)) 43 self.bam_object.lengths))
39 self.all_query_positions = self.query_positions(self.bam_object) 44 self.alignement_dic = self.index_alignments(self.bam_object)
45 self.all_query_positions = self.query_positions(self.bam_object,
46 overlap=self.overlap)
40 self.readdic = self.make_readdic(self.bam_object) 47 self.readdic = self.make_readdic(self.bam_object)
48 self.pairing()
41 49
42 def make_readdic(self, bam_object): 50 def make_readdic(self, bam_object):
43 readdic = defaultdict(int) 51 readdic = defaultdict(int)
44 for read in bam_object.fetch(): 52 for read in bam_object.fetch():
45 readdic[read.query_sequence] += 1 53 readdic[read.query_sequence] += 1
46 return readdic 54 return readdic
47 55
48 def query_positions(self, bam_object): 56 def index_alignments(self, bam_object):
49 all_query_positions = defaultdict(list) 57 '''
58 dic[(chrom, pos, polarity)]: [readseq1, readseq2, ...]
59 the list value is further converted in set
60 '''
61 dic = defaultdict(list)
50 for chrom in self.chromosomes: 62 for chrom in self.chromosomes:
51 for read in bam_object.fetch(chrom): 63 for read in bam_object.fetch(chrom):
52 if not read.is_reverse: 64 if read.is_reverse:
53 all_query_positions[chrom].append( 65 coord = read.reference_end-1
54 read.reference_start) 66 pol = 'R'
55 else: 67 else:
56 all_query_positions[chrom].append( 68 coord = read.reference_start
57 read.reference_end) 69 pol = 'F'
70 dic[(chrom, coord, pol)].append(read.query_sequence)
71 for key in dic:
72 dic[key] = set(dic[key])
73 return dic
74
75 def query_positions(self, bam_object, overlap):
76 all_query_positions = defaultdict(list)
77 for genomicKey in self.alignement_dic.keys():
78 chrom, coord, pol = genomicKey
79 if pol == 'F' and len(self.alignement_dic[(chrom,
80 coord+overlap-1,
81 'R')]) > 0:
82 all_query_positions[chrom].append(coord)
83 for chrom in all_query_positions:
58 all_query_positions[chrom] = sorted( 84 all_query_positions[chrom] = sorted(
59 list(set(all_query_positions[chrom]))) 85 list(set(all_query_positions[chrom])))
60 return all_query_positions 86 return all_query_positions
61 87
62 def direct_pairing(self, minquery, maxquery, mintarget, maxtarget, 88 def pairing(self):
63 file, overlap=10): 89 F = open(self.output, 'w')
64 F = open(file, 'w') 90 query_range = self.query_range
65 query_range = range(minquery, maxquery + 1) 91 target_range = self.target_range
66 target_range = range(mintarget, maxtarget + 1) 92 overlap = self.overlap
67 stringresult = [] 93 stringresult = []
94 header_template = '>%s|coord=%s|strand %s|size=%s|nreads=%s\n%s\n'
68 for chrom in sorted(self.chromosomes): 95 for chrom in sorted(self.chromosomes):
69 for pos in (self.all_query_positions[chrom]): 96 for pos in self.all_query_positions[chrom]:
70 iterreads_1 = self.bam_object.fetch(chrom, 97 stringbuffer = []
71 start=pos, end=pos+overlap-1) 98 uppers = self.alignement_dic[chrom, pos, 'F']
72 iterreads_2 = self.bam_object.fetch(chrom, 99 lowers = self.alignement_dic[chrom, pos+overlap-1, 'R']
73 start=pos, end=pos+overlap-1) 100 if uppers and lowers:
74 iterreads_3 = self.bam_object.fetch(chrom, 101 for upread in uppers:
75 start=pos, end=pos+overlap-1) 102 for downread in lowers:
76 iterreads_4 = self.bam_object.fetch(chrom, 103 if (len(upread) in query_range and len(downread) in
77 start=pos, end=pos+overlap-1) 104 target_range) or (len(upread) in target_range
78 # 1 105 and len(downread) in
79 for queryread in iterreads_1: 106 query_range):
80 if queryread.reference_start == pos and \ 107 stringbuffer.append(
81 queryread.query_alignment_length in query_range \ 108 header_template %
82 and not queryread.is_reverse: 109 (chrom, pos+1, '+', len(upread),
83 for targetread in iterreads_2: 110 self.readdic[upread], upread))
84 if (targetread. 111 stringbuffer.append(
85 get_reference_positions()[-1] 112 header_template %
86 == queryread.get_reference_positions( 113 (chrom, pos+overlap-len(downread)+1, '-',
87 )[overlap-1] and 114 len(downread), self.readdic[downread],
88 targetread.query_alignment_length in 115 self.revcomp(downread)))
89 target_range and targetread.is_reverse): 116 stringresult.extend(sorted(set(stringbuffer)))
90 targetreadseq = self.revcomp(
91 targetread.query_sequence)
92 stringresult.append(
93 '>%s|%s|%s|%s|n=%s\n%s\n' %
94 (chrom, queryread.reference_start+1,
95 'F', queryread.query_alignment_length,
96 self.readdic[queryread.query_sequence],
97 queryread.query_sequence))
98 stringresult.append(
99 '>%s|%s|%s|%s|n=%s\n%s\n' %
100 (chrom, targetread.reference_start+1,
101 'R', targetread.query_alignment_length,
102 self.readdic[targetread.query_sequence],
103 targetreadseq))
104 # 2
105 for queryread in iterreads_3:
106 if queryread.reference_end-1 == pos+overlap-1 and \
107 queryread.query_alignment_length in query_range \
108 and queryread.is_reverse:
109 for targetread in iterreads_4:
110 if (targetread.
111 reference_start
112 == pos and targetread.query_alignment_length
113 in target_range and not
114 targetread.is_reverse):
115 queryreadseq = self.revcomp(
116 queryread.query_sequence)
117 targetreadseq = targetread.query_sequence
118 stringresult.append(
119 '>%s|%s|%s|%s|n=%s\n%s\n' %
120 (chrom, queryread.reference_start+1, 'R',
121 queryread.query_alignment_length,
122 self.readdic[queryread.query_sequence],
123 queryreadseq))
124 stringresult.append(
125 '>%s|%s|%s|%s|n=%s\n%s\n' %
126 (chrom, targetread.reference_start+1,
127 'F', targetread.query_alignment_length,
128 self.readdic[targetread.query_sequence],
129 targetreadseq))
130 stringresult = sorted(set(stringresult),
131 key=lambda x: stringresult.index(x))
132 F.write(''.join(stringresult)) 117 F.write(''.join(stringresult))
133 118
134 def revcomp(self, sequence): 119 def revcomp(self, sequence):
135 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} 120 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
136 revseq = sequence[::-1] 121 revseq = sequence[::-1]
137 return "".join([antidict[i] for i in revseq]) 122 return "".join([antidict[i] for i in revseq])
138 123
139 124
140 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10):
141 mapobj = Map(input)
142 mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget,
143 output, overlap)
144
145
146 if __name__ == "__main__": 125 if __name__ == "__main__":
147 args = Parser() 126 args = Parser()
148 main(args.input, args.minquery, args.maxquery, args.mintarget, 127 mapobj = Map(args.input, args.output, args.minquery, args.maxquery,
149 args.maxtarget, args.output, args.overlap) 128 args.mintarget, args.maxtarget, args.overlap)