diff overlapping_reads.py @ 3:4d9682bd3a6b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 96ed5824190aff281cc3aa47dc60fc66aac41db3
author artbio
date Sat, 02 Sep 2017 06:35:15 -0400
parents 6f1378738798
children 20d28cfdeefe
line wrap: on
line diff
--- a/overlapping_reads.py	Wed Aug 30 05:40:18 2017 -0400
+++ b/overlapping_reads.py	Sat Sep 02 06:35:15 2017 -0400
@@ -36,90 +36,106 @@
         self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
         self.chromosomes = dict(zip(self.bam_object.references,
                                 self.bam_object.lengths))
-        self.map_dict = self.create_map(self.bam_object)
+        self.all_query_positions = self.query_positions(self.bam_object)
+        self.readdic = self.make_readdic(self.bam_object)
 
-    def create_map(self, bam_object):
-        '''
-        Returns a map_dictionary {(chromosome,read_position,polarity):
-                                                    [read_length, ...]}
-        '''
-        map_dictionary = defaultdict(list)
-        # get empty value for start and end of each chromosome
-        for chrom in self.chromosomes:
-            map_dictionary[(chrom, 1, 'F')] = []
-            map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
+    def make_readdic(self, bam_object):
+        readdic = defaultdict(int)
+        for read in bam_object.fetch():
+            readdic[read.query_sequence] += 1
+        return readdic
+
+    def query_positions(self, bam_object):
+        all_query_positions = defaultdict(list)
         for chrom in self.chromosomes:
             for read in bam_object.fetch(chrom):
-                positions = read.positions  # a list of covered positions
-                if read.is_reverse:
-                    map_dictionary[(chrom, positions[-1]+1,
-                                    'R')].append(read.query_alignment_length)
+                if not read.is_reverse:
+                    all_query_positions[chrom].append(
+                        read.get_reference_positions(full_length=True)[0])
                 else:
-                    map_dictionary[(chrom, positions[0]+1,
-                                    'F')].append(read.query_alignment_length)
-        return map_dictionary
+                    all_query_positions[chrom].append(
+                        read.get_reference_positions(full_length=True)[-1])
+            all_query_positions[chrom] = sorted(
+                list(set(all_query_positions[chrom])))
+        return all_query_positions
 
-    def signature_tables(self, minquery, maxquery, mintarget, maxtarget):
+    def direct_pairing(self, minquery, maxquery, mintarget, maxtarget,
+                       file, overlap=10):
+        F = open(file, 'w')
         query_range = range(minquery, maxquery + 1)
         target_range = range(mintarget, maxtarget + 1)
-        Query_table = defaultdict(dict)
-        Target_table = defaultdict(dict)
-        for key in self.map_dict:
-            for size in self.map_dict[key]:
-                if size in query_range or size in target_range:
-                    if key[2] == 'F':
-                        coordinate = key[1]
-                    else:
-                        coordinate = -key[1]
-                if size in query_range:
-                    Query_table[key[0]][coordinate] = Query_table[key[0]].get(
-                        coordinate, 0) + 1
-                if size in target_range:
-                    Target_table[key[0]][coordinate] = \
-                        Target_table[key[0]].get(coordinate, 0) + 1
-        return Query_table, Target_table
-
-    def search_overlaps(self, minquery, maxquery, mintarget, maxtarget,
-                        overlap=10):
-        Query_table, Target_table = self.signature_tables(minquery, maxquery,
-                                                          mintarget, maxtarget)
-        overlap_groups = defaultdict(list)
-        for chrom in Query_table:
-            for coord in Query_table[chrom]:
-                if Target_table[chrom].get(-coord - overlap + 1, 0):
-                    overlap_groups[chrom].append(coord)
-        return overlap_groups
-
-    def feed_overlaps(self, overlap_groups, minquery, output, overlap=10):
-        F = open(output, 'w')
-        for chrom in sorted(overlap_groups):
-            for pos in sorted(overlap_groups[chrom]):
-                if pos > 0:  # read are forward
-                    reads = self.bam_object.fetch(chrom, start=pos-1,
-                                                  end=pos-1+overlap-1)
-                    for read in reads:
-                        positions = read.positions
-                        if pos-1 == positions[0] and \
-                                read.query_alignment_length >= minquery:
-                            F.write('>%s|%s|%s|%s\n%s\n' % (
-                                chrom, pos, 'F',
-                                read.query_alignment_length,
-                                read.query_sequence))
-                else:  # reads are reverse
-                    reads = self.bam_object.fetch(chrom,
-                                                  start=-pos-1-overlap+1,
-                                                  end=-pos-1)
-                    for read in reads:
-                        positions = read.positions
-                        if -pos-1 == positions[-1] and \
-                                read.query_alignment_length >= minquery:
-                            readseq = self.revcomp(read.query_sequence)
-                            readsize = read.query_alignment_length
-                            F.write('>%s|%s|%s|%s\n%s\n' % (chrom,
-                                                       positions[0] + 1,
-                                                       'R', readsize, readseq))
-        F.close()
-        return
+        stringresult = []
+        for chrom in sorted(self.chromosomes):
+            for pos in (self.all_query_positions[chrom]):
+                iterreads_1 = self.bam_object.fetch(chrom,
+                                                    start=pos, end=pos+overlap-1)
+                iterreads_2 = self.bam_object.fetch(chrom,
+                                                    start=pos, end=pos+overlap-1)
+                iterreads_3 = self.bam_object.fetch(chrom,
+                                                    start=pos, end=pos+overlap-1)
+                iterreads_4 = self.bam_object.fetch(chrom,
+                                                    start=pos, end=pos+overlap-1)
+                #  1
+                for queryread in iterreads_1:
+                    if queryread.get_reference_positions(
+                        full_length=True)[0] == pos and \
+                        queryread.query_alignment_length in query_range \
+                            and not queryread.is_reverse:
+                        for targetread in iterreads_2:
+                            if (targetread.
+                                get_reference_positions(full_length=True)[-1]
+                                == queryread.get_reference_positions(
+                                   full_length=True)[overlap-1] and
+                                targetread.query_alignment_length in
+                                    target_range and targetread.is_reverse):
+                                targetreadseq = self.revcomp(
+                                    targetread.query_sequence)
+                                stringresult.append(
+                                    '>%s|%s|%s|%s|n=%s\n%s\n' %
+                                    (chrom, queryread.get_reference_positions(
+                                     full_length=True)[0]+1,
+                                     'F', queryread.query_alignment_length,
+                                     self.readdic[queryread.query_sequence],
+                                     queryread.query_sequence))
+                                stringresult.append(
+                                    '>%s|%s|%s|%s|n=%s\n%s\n' %
+                                    (chrom, targetread.get_reference_positions(
+                                     full_length=True)[0]+1,
+                                     'R', targetread.query_alignment_length,
+                                     self.readdic[targetread.query_sequence],
+                                     targetreadseq))
+                #  2
+                for queryread in iterreads_3:
+                    if queryread.get_reference_positions(
+                        full_length=True)[-1] == pos+overlap-1 and \
+                        queryread.query_alignment_length in query_range \
+                            and queryread.is_reverse:
+                        for targetread in iterreads_4:
+                            if (targetread.
+                                get_reference_positions(full_length=True)[0]
+                                == pos and targetread.query_alignment_length
+                                    in target_range and not
+                                    targetread.is_reverse):
+                                queryreadseq = self.revcomp(
+                                    queryread.query_sequence)
+                                targetreadseq = targetread.query_sequence
+                                stringresult.append(
+                                    '>%s|%s|%s|%s|n=%s\n%s\n' %
+                                    (chrom, queryread.get_reference_positions(
+                                     full_length=True)[0]+1, 'R',
+                                     queryread.query_alignment_length,
+                                     self.readdic[queryread.query_sequence],
+                                     queryreadseq))
+                                stringresult.append(
+                                    '>%s|%s|%s|%s|n=%s\n%s\n' %
+                                    (chrom, targetread.get_reference_positions(
+                                     full_length=True)[0]+1,
+                                     'F', targetread.query_alignment_length,
+                                     self.readdic[targetread.query_sequence],
+                                     targetreadseq))
+        stringresult = sorted(set(stringresult),
+                              key=lambda x: stringresult.index(x))
+        F.write(''.join(stringresult))
 
     def revcomp(self, sequence):
         antidict = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
@@ -129,12 +145,11 @@
 
 def main(input, minquery, maxquery, mintarget, maxtarget, output, overlap=10):
     mapobj = Map(input)
-    mapobj.feed_overlaps(mapobj.search_overlaps(minquery, maxquery,
-                                                mintarget, maxtarget,
-                                                overlap), minquery, output)
+    mapobj.direct_pairing(minquery, maxquery, mintarget, maxtarget,
+                          output, overlap)
 
 
 if __name__ == "__main__":
     args = Parser()
     main(args.input, args.minquery, args.maxquery, args.mintarget,
-         args.maxtarget, args.output)
+         args.maxtarget, args.output, args.overlap)