changeset 7:07771982ef9b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_signatures commit 7276b6b73aef7af4058ad2c1e34c4557e9cccbe0
author artbio
date Sun, 10 Sep 2017 13:50:40 -0400
parents 4da23f009c9e
children 5c4504825d74
files overlapping_reads.py overlapping_reads.xml signature.py signature.xml
diffstat 4 files changed, 102 insertions(+), 65 deletions(-) [+]
line wrap: on
line diff
--- a/overlapping_reads.py	Sun Sep 10 10:27:19 2017 -0400
+++ b/overlapping_reads.py	Sun Sep 10 13:50:40 2017 -0400
@@ -88,13 +88,17 @@
     def countpairs(self, uppers, lowers):
         query_range = self.query_range
         target_range = self.target_range
-        uppers = [seq for seq in uppers if (len(seq) in query_range or len(seq) in target_range)]
+        uppers = [seq for seq in uppers if (len(seq) in query_range or len(seq)
+                  in target_range)]
+        print(uppers)
         uppers_expanded = []
         for seq in uppers:
             expand = [seq for i in range(self.readdic[seq])]
             uppers_expanded.extend(expand)
+        print(uppers_expanded)
         uppers = uppers_expanded
-        lowers = [seq for seq in lowers if (len(seq) in query_range or len(seq) in target_range)]
+        lowers = [seq for seq in lowers if (len(seq) in query_range or len(seq)
+                  in target_range)]
         lowers_expanded = []
         for seq in lowers:
             expand = [seq for i in range(self.readdic[seq])]
@@ -119,7 +123,7 @@
         stringresult = []
         header_template = '>%s|coord=%s|strand %s|size=%s|nreads=%s\n%s\n'
         total_pairs = 0
-        print ('Chromosome\tNbre of pairs')
+        print('Chromosome\tNbre of pairs')
         for chrom in sorted(self.chromosomes):
             number_pairs = 0
             for pos in self.all_query_positions[chrom]:
@@ -146,7 +150,8 @@
                                      self.revcomp(downread)))
                 stringresult.extend(sorted(set(stringbuffer)))
             print('%s\t%s' % (chrom, number_pairs))
-        print('Total nbre of pairs that can be simultaneously formed\t%s' % total_pairs)
+        print('Total nbre of pairs that can be simultaneously formed\t%s'
+              % total_pairs)
         F.write(''.join(stringresult))
 
     def revcomp(self, sequence):
--- a/overlapping_reads.xml	Sun Sep 10 10:27:19 2017 -0400
+++ b/overlapping_reads.xml	Sun Sep 10 13:50:40 2017 -0400
@@ -120,6 +120,11 @@
 the second sequence in this example corresponds to 1 read that overlap by 10 nt with
 1 read of the first sequence.
 
+The tool also returns in the standard output the numbers of pairs of reads that
+can be formed simultaneously in silico. Note that these numbers are distinct from the numbers
+of pairs of read alignments (as computed by the small_rna_signature tool) when analysis is
+performed with multi-mapping reads.
+
         </help>
     <citations>
             <citation type="doi">10.1007/978-1-4939-0931-5_12</citation>
--- a/signature.py	Sun Sep 10 10:27:19 2017 -0400
+++ b/signature.py	Sun Sep 10 13:50:40 2017 -0400
@@ -40,11 +40,23 @@
 
 class Map:
 
-    def __init__(self, bam_file):
+    def __init__(self, bam_file, minquery=23, maxquery=29, mintarget=23,
+                 maxtarget=29, minscope=1, maxscope=19, output_h='',
+                 output_z=''):
         self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
+        self.query_range = range(minquery, maxquery + 1)
+        self.target_range = range(mintarget, maxtarget + 1)
+        self.scope = range(minscope, maxscope + 1)
+        self.H = open(output_h, 'w')
+        self.Z = open(output_z, 'w')
         self.chromosomes = dict(zip(self.bam_object.references,
                                 self.bam_object.lengths))
         self.map_dict = self.create_map(self.bam_object)
+        self.query_positions = self.compute_query_positions()
+        self.Z.write(self.compute_signature_pairs())
+        self.H.write(self.compute_signature_h())
+        self.H.close()
+        self.Z.close()
 
     def create_map(self, bam_object):
         '''
@@ -58,18 +70,73 @@
             map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = []
         for chrom in self.chromosomes:
             for read in bam_object.fetch(chrom):
-                positions = read.positions  # a list of covered positions
                 if read.is_reverse:
-                    map_dictionary[(chrom, positions[-1]+1,
+                    map_dictionary[(chrom, read.reference_end,
                                     'R')].append(read.query_alignment_length)
                 else:
-                    map_dictionary[(chrom, positions[0]+1,
+                    map_dictionary[(chrom, read.reference_start+1,
                                     'F')].append(read.query_alignment_length)
         return map_dictionary
 
-    def signature_tables(self, minquery, maxquery, mintarget, maxtarget):
-        query_range = range(minquery, maxquery + 1)
-        target_range = range(mintarget, maxtarget + 1)
+    def compute_query_positions(self):
+        ''' this method does not filter on read size, just forward reads
+        that overlap reverse reads in the overlap range'''
+        all_query_positions = defaultdict(list)
+        for genomicKey in self.map_dict.keys():
+            chrom, coord, pol = genomicKey
+            for i in self.scope:
+                if pol == 'F' and len(self.map_dict[chrom,
+                                                    coord+i-1,
+                                                    'R']) > 0:
+                    all_query_positions[chrom].append(coord)
+                    break
+        for chrom in all_query_positions:
+            all_query_positions[chrom] = sorted(
+                list(set(all_query_positions[chrom])))
+        return all_query_positions
+
+    def countpairs(self, uppers, lowers):
+        query_range = self.query_range
+        target_range = self.target_range
+        uppers = [size for size in uppers if size in query_range or size in
+                  target_range]
+        lowers = [size for size in lowers if size in query_range or size in
+                  target_range]
+        paired = []
+        for upread in uppers:
+            for downread in lowers:
+                if (upread in query_range and downread in target_range) or (
+                        upread in target_range and downread in query_range):
+                    paired.append(upread)
+                    lowers.remove(downread)
+                    break
+        return len(paired)
+
+    def compute_signature_pairs(self):
+        frequency_table = defaultdict(dict)
+        scope = self.scope
+        for chrom in self.chromosomes:
+            for overlap in scope:
+                frequency_table[chrom][overlap] = 0
+        for chrom in self.query_positions:
+            for coord in self.query_positions[chrom]:
+                for overlap in scope:
+                    uppers = self.map_dict[chrom, coord, 'F']
+                    lowers = self.map_dict[chrom, coord+overlap-1, 'R']
+                    frequency_table[chrom][overlap] += self.countpairs(uppers,
+                                                                       lowers)
+        # compute overlaps for all chromosomes merged
+        for overlap in scope:
+            accumulator = []
+            for chrom in frequency_table:
+                if chrom != 'all_chromosomes':
+                    accumulator.append(frequency_table[chrom][overlap])
+            frequency_table['all_chromosomes'][overlap] = sum(accumulator)
+        return self.stringify_table(frequency_table)
+
+    def signature_tables(self):
+        query_range = self.query_range
+        target_range = self.target_range
         Query_table = defaultdict(dict)
         Target_table = defaultdict(dict)
         for key in self.map_dict:
@@ -87,39 +154,9 @@
                         Target_table[key[0]].get(coordinate, 0) + 1
         return Query_table, Target_table
 
-    def compute_signature_z(self, minquery, maxquery, mintarget, maxtarget,
-                            scope, zscore="no"):
-        Query_table, Target_table = self.signature_tables(minquery, maxquery,
-                                                     mintarget, maxtarget)
-        frequency_table = defaultdict(dict)
-        for chrom in self.chromosomes:
-            for overlap in scope:
-                frequency_table[chrom][overlap] = 0
-        for chrom in Query_table:
-            for coord in Query_table[chrom]:
-                for overlap in scope:
-                    frequency_table[chrom][overlap] += min(
-                        Query_table[chrom][coord],
-                        Target_table[chrom].get(-coord - overlap + 1, 0))
-        # since we want the number of pairs, not the number or paired reads
-        # to do: what in complex cases
-        # with query and target sizes partially overlap ?
-        for chrom in frequency_table:
-            for overlap in frequency_table[chrom]:
-                frequency_table[chrom][overlap] /= 2
-        # compute overlaps for all chromosomes merged
-        for overlap in scope:
-            accumulator = []
-            for chrom in frequency_table:
-                if chrom != 'all_chromosomes':
-                    accumulator.append(frequency_table[chrom][overlap])
-            frequency_table['all_chromosomes'][overlap] = sum(accumulator)
-        return self.stringify_table(frequency_table)
-
-    def compute_signature_h(self, minquery, maxquery, mintarget,
-                                   maxtarget, scope):
-        Query_table, Target_table = self.signature_tables(minquery, maxquery,
-                                                     mintarget, maxtarget)
+    def compute_signature_h(self):
+        scope = self.scope
+        Query_table, Target_table = self.signature_tables()
         frequency_table = defaultdict(dict)
         for chrom in self.chromosomes:
             for overlap in scope:
@@ -187,23 +224,8 @@
         return ''.join(tablestring)
 
 
-
-def main(input, minquery, maxquery, mintarget, maxtarget, minscope, maxscope,
-         output_h, output_z, genome_wide=False, zscore="no"):
-    H = open(output_h, 'w')
-    Z = open(output_z, 'w')
-    mapobj = Map(input)
-    scope = range(minscope, maxscope + 1)
-    Z.write(mapobj.compute_signature_z(minquery, maxquery, mintarget,
-            maxtarget, scope, zscore="no"))
-    H.write(mapobj.compute_signature_h(minquery, maxquery, mintarget,
-            maxtarget, scope))
-    H.close()
-    Z.close()
-
-
 if __name__ == "__main__":
     args = Parser()
-    main(args.input, args.minquery, args.maxquery, args.mintarget,
-         args.maxtarget, args.minscope, args.maxscope, args.output_h,
-         args.output_z)
+    mapobj = Map(args.input, args.minquery, args.maxquery, args.mintarget,
+                 args.maxtarget, args.minscope, args.maxscope, args.output_h,
+                 args.output_z)
--- a/signature.xml	Sun Sep 10 10:27:19 2017 -0400
+++ b/signature.xml	Sun Sep 10 13:50:40 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="signature" name="Small RNA Signatures" version="3.0.0">
+<tool id="signature" name="Small RNA Signatures" version="3.1.0">
     <description />
     <requirements>
         <requirement type="package" version="1.11.2=py27_0">numpy</requirement>
@@ -89,8 +89,13 @@
 
 Compute small RNA (piRNA, siRNA, ...) signatures.
 
-This tool computes (i) the number of pairs by overlap classes (in nt) and associated the z-scores,
-and (ii) the ping-pong signal (`Brennecke et al, 2009`_) and associated z-scores. 
+This tool computes (i) the number of pairs **aligned** reads by overlap classes (in nt) and associated the z-scores,
+and (ii) the ping-pong signal (`Brennecke et al, 2009`_) and associated z-scores.
+
+**Note** that the number of pairs of aligned reads is disctint from the number of pairs of reads
+when these reads can be aligned at multiple positions in the genome. The two values are equal only
+when the analysis is restricted to uniquely mapping reads.
+
 Options set the min and max size of both the query small rna class and the target small rna class, 
 the range over which to compute the signatures, and whether the signatures should be reported at 
 genome-wide level or by item (chromosomes, genes, etc.). For details on computational algorithmes