Mercurial > repos > artbio > small_rna_signatures
diff overlapping_reads.py @ 5:a7fd04208764 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 24d44a9b7ec9db4dce3d839b597eea2b1be34adb
author | artbio |
---|---|
date | Sat, 09 Sep 2017 11:57:39 -0400 |
parents | 20d28cfdeefe |
children | 4da23f009c9e |
line wrap: on
line diff
--- a/overlapping_reads.py Fri Sep 08 04:44:22 2017 -0400 +++ b/overlapping_reads.py Sat Sep 09 11:57:39 2017 -0400 @@ -32,12 +32,20 @@ class Map: - def __init__(self, bam_file): + def __init__(self, bam_file, output, minquery=23, maxquery=29, + mintarget=23, maxtarget=29, overlap=10): self.bam_object = pysam.AlignmentFile(bam_file, 'rb') + self.output = output + self.query_range = range(minquery, maxquery + 1) + self.target_range = range(mintarget, maxtarget + 1) + self.overlap = overlap self.chromosomes = dict(zip(self.bam_object.references, self.bam_object.lengths)) - self.all_query_positions = self.query_positions(self.bam_object) + self.alignement_dic = self.index_alignments(self.bam_object) + self.all_query_positions = self.query_positions(self.bam_object, + overlap=self.overlap) self.readdic = self.make_readdic(self.bam_object) + self.pairing() def make_readdic(self, bam_object): readdic = defaultdict(int) @@ -45,90 +53,67 @@ readdic[read.query_sequence] += 1 return readdic - def query_positions(self, bam_object): - all_query_positions = defaultdict(list) + def index_alignments(self, bam_object): + ''' + dic[(chrom, pos, polarity)]: [readseq1, readseq2, ...] + the list value is further converted in set + ''' + dic = defaultdict(list) for chrom in self.chromosomes: for read in bam_object.fetch(chrom): - if not read.is_reverse: - all_query_positions[chrom].append( - read.reference_start) + if read.is_reverse: + coord = read.reference_end-1 + pol = 'R' else: - all_query_positions[chrom].append( - read.reference_end) + coord = read.reference_start + pol = 'F' + dic[(chrom, coord, pol)].append(read.query_sequence) + for key in dic: + dic[key] = set(dic[key]) + return dic + + def query_positions(self, bam_object, overlap): + all_query_positions = defaultdict(list) + for genomicKey in self.alignement_dic.keys(): + chrom, coord, pol = genomicKey + if pol == 'F' and len(self.alignement_dic[(chrom, + coord+overlap-1, + 'R')]) > 0: + all_query_positions[chrom].append(coord) + for chrom in all_query_positions: all_query_positions[chrom] = sorted( list(set(all_query_positions[chrom]))) return all_query_positions - def direct_pairing(self, minquery, maxquery, mintarget, maxtarget, - file, overlap=10): - F = open(file, 'w') - query_range = range(minquery, maxquery + 1) - target_range = range(mintarget, maxtarget + 1) + def pairing(self): + F = open(self.output, 'w') + query_range = self.query_range + target_range = self.target_range + overlap = self.overlap stringresult = [] + header_template = '>%s|coord=%s|strand %s|size=%s|nreads=%s\n%s\n' for chrom in sorted(self.chromosomes): - for pos in (self.all_query_positions[chrom]): - iterreads_1 = self.bam_object.fetch(chrom, - start=pos, end=pos+overlap-1) - iterreads_2 = self.bam_object.fetch(chrom, - start=pos, end=pos+overlap-1) - iterreads_3 = self.bam_object.fetch(chrom, - start=pos, end=pos+overlap-1) - iterreads_4 = self.bam_object.fetch(chrom, - start=pos, end=pos+overlap-1) - # 1 - for queryread in iterreads_1: - if queryread.reference_start == pos and \ - queryread.query_alignment_length in query_range \ - and not queryread.is_reverse: - for targetread in iterreads_2: - if (targetread. - get_reference_positions()[-1] - == queryread.get_reference_positions( - )[overlap-1] and - targetread.query_alignment_length in - target_range and targetread.is_reverse): - targetreadseq = self.revcomp( - targetread.query_sequence) - stringresult.append( - '>%s|%s|%s|%s|n=%s\n%s\n' % - (chrom, queryread.reference_start+1, - 'F', queryread.query_alignment_length, - self.readdic[queryread.query_sequence], - queryread.query_sequence)) - stringresult.append( - '>%s|%s|%s|%s|n=%s\n%s\n' % - (chrom, targetread.reference_start+1, - 'R', targetread.query_alignment_length, - self.readdic[targetread.query_sequence], - targetreadseq)) - # 2 - for queryread in iterreads_3: - if queryread.reference_end-1 == pos+overlap-1 and \ - queryread.query_alignment_length in query_range \ - and queryread.is_reverse: - for targetread in iterreads_4: - if (targetread. - reference_start - == pos and targetread.query_alignment_length - in target_range and not - targetread.is_reverse): - queryreadseq = self.revcomp( - queryread.query_sequence) - targetreadseq = targetread.query_sequence - stringresult.append( - '>%s|%s|%s|%s|n=%s\n%s\n' % - (chrom, queryread.reference_start+1, 'R', - queryread.query_alignment_length, - self.readdic[queryread.query_sequence], - queryreadseq)) - stringresult.append( - '>%s|%s|%s|%s|n=%s\n%s\n' % - (chrom, targetread.reference_start+1, - 'F', targetread.query_alignment_length, - self.readdic[targetread.query_sequence], - targetreadseq)) - stringresult = sorted(set(stringresult), - key=lambda x: stringresult.index(x)) + for pos in self.all_query_positions[chrom]: + stringbuffer = [] + uppers = self.alignement_dic[chrom, pos, 'F'] + lowers = self.alignement_dic[chrom, pos+overlap-1, 'R'] + if uppers and lowers: + for upread in uppers: + for downread in lowers: + if (len(upread) in query_range and len(downread) in + target_range) or (len(upread) in target_range + and len(downread) in + query_range): + stringbuffer.append( + header_template % + (chrom, pos+1, '+', len(upread), + self.readdic[upread], upread)) + stringbuffer.append( + header_template % + (chrom, pos+overlap-len(downread)+1, '-', + len(downread), self.readdic[downread], + self.revcomp(downread))) + stringresult.extend(sorted(set(stringbuffer))) F.write(''.join(stringresult)) def revcomp(self, sequence): @@ -137,13 +122,7 @@ return "".join([antidict[i] for i in revseq]) -def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10): - mapobj = Map(input) - mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget, - output, overlap) - - if __name__ == "__main__": args = Parser() - main(args.input, args.minquery, args.maxquery, args.mintarget, - args.maxtarget, args.output, args.overlap) + mapobj = Map(args.input, args.output, args.minquery, args.maxquery, + args.mintarget, args.maxtarget, args.overlap)