Mercurial > repos > artbio > small_rna_signatures
comparison overlapping_reads.py @ 5:a7fd04208764 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 24d44a9b7ec9db4dce3d839b597eea2b1be34adb
author | artbio |
---|---|
date | Sat, 09 Sep 2017 11:57:39 -0400 |
parents | 20d28cfdeefe |
children | 4da23f009c9e |
comparison
equal
deleted
inserted
replaced
4:20d28cfdeefe | 5:a7fd04208764 |
---|---|
30 return args | 30 return args |
31 | 31 |
32 | 32 |
33 class Map: | 33 class Map: |
34 | 34 |
35 def __init__(self, bam_file): | 35 def __init__(self, bam_file, output, minquery=23, maxquery=29, |
36 mintarget=23, maxtarget=29, overlap=10): | |
36 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') | 37 self.bam_object = pysam.AlignmentFile(bam_file, 'rb') |
38 self.output = output | |
39 self.query_range = range(minquery, maxquery + 1) | |
40 self.target_range = range(mintarget, maxtarget + 1) | |
41 self.overlap = overlap | |
37 self.chromosomes = dict(zip(self.bam_object.references, | 42 self.chromosomes = dict(zip(self.bam_object.references, |
38 self.bam_object.lengths)) | 43 self.bam_object.lengths)) |
39 self.all_query_positions = self.query_positions(self.bam_object) | 44 self.alignement_dic = self.index_alignments(self.bam_object) |
45 self.all_query_positions = self.query_positions(self.bam_object, | |
46 overlap=self.overlap) | |
40 self.readdic = self.make_readdic(self.bam_object) | 47 self.readdic = self.make_readdic(self.bam_object) |
48 self.pairing() | |
41 | 49 |
42 def make_readdic(self, bam_object): | 50 def make_readdic(self, bam_object): |
43 readdic = defaultdict(int) | 51 readdic = defaultdict(int) |
44 for read in bam_object.fetch(): | 52 for read in bam_object.fetch(): |
45 readdic[read.query_sequence] += 1 | 53 readdic[read.query_sequence] += 1 |
46 return readdic | 54 return readdic |
47 | 55 |
48 def query_positions(self, bam_object): | 56 def index_alignments(self, bam_object): |
49 all_query_positions = defaultdict(list) | 57 ''' |
58 dic[(chrom, pos, polarity)]: [readseq1, readseq2, ...] | |
59 the list value is further converted in set | |
60 ''' | |
61 dic = defaultdict(list) | |
50 for chrom in self.chromosomes: | 62 for chrom in self.chromosomes: |
51 for read in bam_object.fetch(chrom): | 63 for read in bam_object.fetch(chrom): |
52 if not read.is_reverse: | 64 if read.is_reverse: |
53 all_query_positions[chrom].append( | 65 coord = read.reference_end-1 |
54 read.reference_start) | 66 pol = 'R' |
55 else: | 67 else: |
56 all_query_positions[chrom].append( | 68 coord = read.reference_start |
57 read.reference_end) | 69 pol = 'F' |
70 dic[(chrom, coord, pol)].append(read.query_sequence) | |
71 for key in dic: | |
72 dic[key] = set(dic[key]) | |
73 return dic | |
74 | |
75 def query_positions(self, bam_object, overlap): | |
76 all_query_positions = defaultdict(list) | |
77 for genomicKey in self.alignement_dic.keys(): | |
78 chrom, coord, pol = genomicKey | |
79 if pol == 'F' and len(self.alignement_dic[(chrom, | |
80 coord+overlap-1, | |
81 'R')]) > 0: | |
82 all_query_positions[chrom].append(coord) | |
83 for chrom in all_query_positions: | |
58 all_query_positions[chrom] = sorted( | 84 all_query_positions[chrom] = sorted( |
59 list(set(all_query_positions[chrom]))) | 85 list(set(all_query_positions[chrom]))) |
60 return all_query_positions | 86 return all_query_positions |
61 | 87 |
62 def direct_pairing(self, minquery, maxquery, mintarget, maxtarget, | 88 def pairing(self): |
63 file, overlap=10): | 89 F = open(self.output, 'w') |
64 F = open(file, 'w') | 90 query_range = self.query_range |
65 query_range = range(minquery, maxquery + 1) | 91 target_range = self.target_range |
66 target_range = range(mintarget, maxtarget + 1) | 92 overlap = self.overlap |
67 stringresult = [] | 93 stringresult = [] |
94 header_template = '>%s|coord=%s|strand %s|size=%s|nreads=%s\n%s\n' | |
68 for chrom in sorted(self.chromosomes): | 95 for chrom in sorted(self.chromosomes): |
69 for pos in (self.all_query_positions[chrom]): | 96 for pos in self.all_query_positions[chrom]: |
70 iterreads_1 = self.bam_object.fetch(chrom, | 97 stringbuffer = [] |
71 start=pos, end=pos+overlap-1) | 98 uppers = self.alignement_dic[chrom, pos, 'F'] |
72 iterreads_2 = self.bam_object.fetch(chrom, | 99 lowers = self.alignement_dic[chrom, pos+overlap-1, 'R'] |
73 start=pos, end=pos+overlap-1) | 100 if uppers and lowers: |
74 iterreads_3 = self.bam_object.fetch(chrom, | 101 for upread in uppers: |
75 start=pos, end=pos+overlap-1) | 102 for downread in lowers: |
76 iterreads_4 = self.bam_object.fetch(chrom, | 103 if (len(upread) in query_range and len(downread) in |
77 start=pos, end=pos+overlap-1) | 104 target_range) or (len(upread) in target_range |
78 # 1 | 105 and len(downread) in |
79 for queryread in iterreads_1: | 106 query_range): |
80 if queryread.reference_start == pos and \ | 107 stringbuffer.append( |
81 queryread.query_alignment_length in query_range \ | 108 header_template % |
82 and not queryread.is_reverse: | 109 (chrom, pos+1, '+', len(upread), |
83 for targetread in iterreads_2: | 110 self.readdic[upread], upread)) |
84 if (targetread. | 111 stringbuffer.append( |
85 get_reference_positions()[-1] | 112 header_template % |
86 == queryread.get_reference_positions( | 113 (chrom, pos+overlap-len(downread)+1, '-', |
87 )[overlap-1] and | 114 len(downread), self.readdic[downread], |
88 targetread.query_alignment_length in | 115 self.revcomp(downread))) |
89 target_range and targetread.is_reverse): | 116 stringresult.extend(sorted(set(stringbuffer))) |
90 targetreadseq = self.revcomp( | |
91 targetread.query_sequence) | |
92 stringresult.append( | |
93 '>%s|%s|%s|%s|n=%s\n%s\n' % | |
94 (chrom, queryread.reference_start+1, | |
95 'F', queryread.query_alignment_length, | |
96 self.readdic[queryread.query_sequence], | |
97 queryread.query_sequence)) | |
98 stringresult.append( | |
99 '>%s|%s|%s|%s|n=%s\n%s\n' % | |
100 (chrom, targetread.reference_start+1, | |
101 'R', targetread.query_alignment_length, | |
102 self.readdic[targetread.query_sequence], | |
103 targetreadseq)) | |
104 # 2 | |
105 for queryread in iterreads_3: | |
106 if queryread.reference_end-1 == pos+overlap-1 and \ | |
107 queryread.query_alignment_length in query_range \ | |
108 and queryread.is_reverse: | |
109 for targetread in iterreads_4: | |
110 if (targetread. | |
111 reference_start | |
112 == pos and targetread.query_alignment_length | |
113 in target_range and not | |
114 targetread.is_reverse): | |
115 queryreadseq = self.revcomp( | |
116 queryread.query_sequence) | |
117 targetreadseq = targetread.query_sequence | |
118 stringresult.append( | |
119 '>%s|%s|%s|%s|n=%s\n%s\n' % | |
120 (chrom, queryread.reference_start+1, 'R', | |
121 queryread.query_alignment_length, | |
122 self.readdic[queryread.query_sequence], | |
123 queryreadseq)) | |
124 stringresult.append( | |
125 '>%s|%s|%s|%s|n=%s\n%s\n' % | |
126 (chrom, targetread.reference_start+1, | |
127 'F', targetread.query_alignment_length, | |
128 self.readdic[targetread.query_sequence], | |
129 targetreadseq)) | |
130 stringresult = sorted(set(stringresult), | |
131 key=lambda x: stringresult.index(x)) | |
132 F.write(''.join(stringresult)) | 117 F.write(''.join(stringresult)) |
133 | 118 |
134 def revcomp(self, sequence): | 119 def revcomp(self, sequence): |
135 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} | 120 antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"} |
136 revseq = sequence[::-1] | 121 revseq = sequence[::-1] |
137 return "".join([antidict[i] for i in revseq]) | 122 return "".join([antidict[i] for i in revseq]) |
138 | 123 |
139 | 124 |
140 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): | |
141 mapobj = Map(input) | |
142 mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget, | |
143 output, overlap) | |
144 | |
145 | |
146 if __name__ == "__main__": | 125 if __name__ == "__main__": |
147 args = Parser() | 126 args = Parser() |
148 main(args.input, args.minquery, args.maxquery, args.mintarget, | 127 mapobj = Map(args.input, args.output, args.minquery, args.maxquery, |
149 args.maxtarget, args.output, args.overlap) | 128 args.mintarget, args.maxtarget, args.overlap) |