annotate small_rna_clusters.py @ 0:8028521b6e4f draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
author artbio
date Mon, 07 Oct 2019 12:51:25 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
1 import argparse
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
2 from collections import defaultdict
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
3
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
4 import pysam
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
5
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
6
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def Parser():
    '''
    Build the command line parser of the tool and parse sys.argv.

    Returns the argparse.Namespace holding the attributes inputs,
    minsize, maxsize, cluster, sample_names, bed, bed_skipsize,
    bed_skipdensity, bed_skipcounts, outputs and nostrand.
    --inputs and --sample_names are mandatory and accept one or more
    values each.
    '''
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument('--inputs', dest='inputs', required=True,
                            nargs='+', help='list of input BAM files')
    the_parser.add_argument('--minsize', dest='minsize', type=int,
                            default=19, help='minimal size of reads')
    the_parser.add_argument('--maxsize', dest='maxsize', type=int,
                            default=29, help='maximal size of reads')
    the_parser.add_argument('--cluster', dest='cluster', type=int,
                            default=0, help='clustering distance')
    the_parser.add_argument('--sample_names', dest='sample_names',
                            required=True, nargs='+',
                            help='list of sample names')
    the_parser.add_argument('--bed', dest='bed', required=False,
                            help='Name of bed output must be specified '
                                 'if --cluster option used')
    # The help texts below report the default through %(default)s so that
    # the displayed value cannot drift away from the declared one.
    the_parser.add_argument('--bed_skipsize', dest='bed_skipsize',
                            required=False, type=int, default=1,
                            help='Skip clusters of size equal or less than '
                                 'specified integer in the bed output. '
                                 'Default = %(default)s')
    the_parser.add_argument('--bed_skipdensity', dest='bed_skipdensity',
                            required=False, type=float, default=0,
                            help='Skip clusters of density equal or less '
                                 'than specified float number in the bed '
                                 'output. Default = %(default)s, '
                                 'not skipping')
    the_parser.add_argument('--bed_skipcounts', dest='bed_skipcounts',
                            required=False, type=int, default=1,
                            help='Skip clusters of counts equal or less '
                                 'than specified integer in the bed '
                                 'output. Default = %(default)s')
    # NOTE(review): the help announces a list of two paths but the option
    # stores a single string (no nargs) - confirm against the caller.
    the_parser.add_argument('--outputs', action='store',
                            help='list of two output paths (only two)')
    the_parser.add_argument('--nostrand', action='store_true',
                            help='Consider reads regardless their polarity')

    args = the_parser.parse_args()
    return args
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
45
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
46
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
47 class Map:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
48
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def __init__(self, bam_file, sample, minsize, maxsize, cluster, nostrand):
    '''
    Open bam_file and build the map dictionary of the sample.

    sample, minsize, maxsize and cluster are stored as given; nostrand
    is stored as a strict boolean. When cluster is truthy, the map
    returned by create_map is replaced by its tile_map clustering.
    '''
    self.sample_name = sample
    self.minsize, self.maxsize = minsize, maxsize
    self.cluster = cluster
    self.nostrand = bool(nostrand)
    # the alignment file is kept open as an attribute of the instance
    self.bam_object = pysam.AlignmentFile(bam_file, 'rb')
    bam = self.bam_object
    # pairs each reference name of the BAM header with its length
    self.chromosomes = dict(zip(bam.references, bam.lengths))
    self.map_dict = self.create_map(bam, self.nostrand)
    if not self.cluster:
        return
    self.map_dict = self.tile_map(self.map_dict, self.cluster)
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
64
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def create_map(self, bam_object, nostrand=False):
    '''
    Index the reads of bam_object by chromosome, coordinate and polarity.

    Returns a dictionary {(chromosome, read_position, polarity):
                          [read_length, ...]}
    Each chromosome of self.chromosomes always gets an (initially empty)
    'F' entry at coordinate 1 and at its last coordinate.
    When nostrand is false, a reverse read is keyed by its last covered
    position + 1 with polarity 'R'; any other read (and every read when
    nostrand is true) is keyed by its first covered position + 1 with
    polarity 'F'.
    '''
    coverage = defaultdict(list)
    for chrom, chrom_length in self.chromosomes.items():
        # placeholders for both ends of the chromosome
        coverage[(chrom, 1, 'F')] = []
        coverage[(chrom, chrom_length, 'F')] = []
        for read in bam_object.fetch(chrom):
            covered = read.positions  # list of covered positions
            # the +1 presumably shifts 0-based positions to 1-based
            # coordinates - TODO confirm against pysam conventions
            if not nostrand and read.is_reverse:
                key = (chrom, covered[-1] + 1, 'R')
            else:
                key = (chrom, covered[0] + 1, 'F')
            coverage[key].append(read.query_alignment_length)
    return coverage
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
90
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def grouper(self, iterable, clust_distance):
    '''
    Generator splitting a sequence of coordinates into clusters.

    iterable is walked in order (tile_map passes sorted coordinates);
    an item joins the current cluster when its distance to the previous
    item is at most clust_distance, otherwise the current cluster is
    yielded and a new one is started with that item.
    Yields lists of coordinates; nothing is yielded for an empty
    iterable.
    '''
    prev = None
    group = []
    for item in iterable:
        # test against None explicitly: a previous coordinate of 0 is
        # falsy and would otherwise force the next item into the group
        # whatever its distance
        if prev is None or item - prev <= clust_distance:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
103
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
104 def tile_map(self, map_dic, clust_distance):
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
105 '''
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
106 takes a map_dictionary {(chromosome,read_position,polarity):
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
107 [read_length, ...]}
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
108 and returns a map_dictionary with structure:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
109 {(chromosome,read_position,polarity):
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
110 [*counts*, [start_clust, end_clust]]}
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
111 '''
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
112 clustered_dic = defaultdict(list)
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
113 for chrom in self.chromosomes:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
114 F_chrom_coord = []
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
115 R_chrom_coord = []
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
116 for key in map_dic:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
117 if key[0] == chrom and key[2] == 'F':
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
118 F_chrom_coord.append(key[1])
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
119 elif key[0] == chrom and key[2] == 'R':
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
120 R_chrom_coord.append(key[1])
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
121 F_chrom_coord = list(set(F_chrom_coord))
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
122 R_chrom_coord = list(set(R_chrom_coord))
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
123 F_chrom_coord.sort()
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
124 R_chrom_coord.sort()
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
125 F_clust_values = [i for i in self.grouper(F_chrom_coord,
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
126 clust_distance)]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
127 F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
128 R_clust_values = [i for i in self.grouper(R_chrom_coord,
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
129 clust_distance)]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
130 R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
131 # now 2 dictionnaries (F and R) with structure:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
132 # {centered_coordinate: [coord1, coord2, coord3, ..]}
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
133 F_clust_dic = dict(zip(F_clust_keys, F_clust_values))
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
134 R_clust_dic = dict(zip(R_clust_keys, R_clust_values))
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
135 for centcoor in F_clust_dic:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
136 accumulator = []
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
137 for coor in F_clust_dic[centcoor]:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
138 accumulator.extend(map_dic[(chrom, coor, 'F')])
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
139 '''
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
140 compute the offset of the cluster due to
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
141 size of reads
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
142 '''
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
143 last = sorted(F_clust_dic[centcoor])[-1]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
144 try:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
145 margin = max(map_dic[(chrom, last, 'F')]) - 1
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
146 except ValueError:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
147 margin = 0
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
148 clustered_dic[(chrom, centcoor, 'F')] = [len(accumulator), [
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
149 F_clust_dic[centcoor][0],
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
150 F_clust_dic[centcoor][-1] + margin]]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
151 for centcoor in R_clust_dic:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
152 accumulator = []
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
153 for coor in R_clust_dic[centcoor]:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
154 accumulator.extend(map_dic[(chrom, coor, 'R')])
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
155 '''
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
156 compute the offset of the cluster due to
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
157 size of reads
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
158 '''
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
159 first = sorted(R_clust_dic[centcoor])[0]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
160 try:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
161 margin = max(map_dic[(chrom, first, 'R')]) - 1
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
162 except ValueError:
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
163 margin = 0
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
164 clustered_dic[(chrom, centcoor, 'R')] = [len(accumulator), [
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
165 R_clust_dic[centcoor][0] - margin,
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
166 R_clust_dic[centcoor][-1]]]
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
167 return clustered_dic
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
168
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def write_table(self, mapdict, out):
    '''
    Dump mapdict as a tabular file, one row per key, in sorted key order.
    Columns are:
    Dataset, Chromosome, Chrom_length, Coordinate, Polarity,
    <some mapped value>
    Keys of mapdict are indexed as (chromosome, coordinate, polarity).
    out is an *open* file handler; it is neither opened nor closed here.
    '''
    for entry in sorted(mapdict):
        # chromosome length is looked up from the chromosome name
        row = (self.sample_name, entry[0], self.chromosomes[entry[0]],
               entry[1], entry[2], mapdict[entry])
        out.write('%s\n' % '\t'.join(map(str, row)))
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
181
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def write_cluster_table(self, clustered_dic, out, bedpath):
    '''
    Writer of the cluster tabular file and of the companion BED file.

    One tab-separated row is written to out for every cluster, with:
    Dataset, Chromosome, Chrom_length, Coordinate, Polarity, Counts,
    Start-End, Cluster Size, density
    Clusters that reach the size / counts / density thresholds are also
    written to the BED file, named cluster_1, cluster_2, ... in sorted
    key order.

    clustered_dic is indexed with (chromosome, coordinate, polarity) keys
    and holds [read_count, [start, end]] values.
    out is an *open* file handler
    bedpath is the path of the BED file; the file is opened in write mode
    (truncated) and closed inside this function.

    The thresholds are read from the module-level args object
    (args.bed_skipsize, args.bed_skipcounts, args.bed_skipdensity), so
    that object must exist when this method is called.
    '''
    def filterCluster(size, count, density):
        # A cluster is reported only if it fails none of the thresholds;
        # the density comparison is strict (density must be > threshold).
        return not (size < args.bed_skipsize
                    or count < args.bed_skipcounts
                    or density <= args.bed_skipdensity)

    clusterid = 0
    # The context manager guarantees the BED handle is released even if
    # a row raises (e.g. a chromosome missing from self.chromosomes).
    with open(bedpath, 'w') as bed:
        for key in sorted(clustered_dic):
            read_count = clustered_dic[key][0]
            span = clustered_dic[key][1]
            start, end = span[0], span[1]
            size = end - start + 1  # start and end are both inclusive
            if self.nostrand:
                polarity = '.'
            elif key[2] == 'F':
                polarity = '+'
            else:
                polarity = '-'
            density = float(read_count) / size
            line = [self.sample_name, key[0], self.chromosomes[key[0]],
                    key[1], key[2], read_count,
                    str(start) + "-" + str(end), str(size), str(density)]
            out.write('\t'.join([str(i) for i in line]) + '\n')
            if filterCluster(size, read_count, density):
                clusterid += 1
                name = 'cluster_' + str(clusterid)
                # start - 1: presumably converts to the 0-based start
                # used by the BED format
                bedline = [key[0], str(start - 1), str(end), name,
                           str(read_count), polarity, str(density)]
                bed.write('\t'.join(bedline) + '\n')
    print("number of reported clusters:", clusterid)
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
225
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
226
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
def main(inputs, samples, outputs, minsize, maxsize, cluster,
         nostrand, bedfile=None, bed_skipsize=0):
    '''
    Build one Map object per (input, sample) pair and write all cluster
    tables, below a single header line, to the outputs file.

    inputs, samples: iterated in lockstep with zip, so extra items in the
        longer of the two are ignored
    outputs: path of the tabular file, opened in write mode
    minsize, maxsize, cluster, nostrand: passed unchanged to Map
    bedfile: path handed to Map.write_cluster_table for the BED output
    bed_skipsize: accepted for interface compatibility; not used here

    NOTE(review): write_cluster_table opens bedfile in 'w' mode on every
    call, so with several inputs only the clusters of the last sample
    remain in the BED file - confirm whether this is intended.
    '''
    header = ["# Dataset", "Chromosome", "Chrom_length", "Coordinate",
              "Polarity", "Counts", "Start-End", "Cluster Size", "density"]
    # The context manager closes the table even if a sample fails midway.
    with open(outputs, 'w') as out:
        out.write('\t'.join(header) + '\n')
        # input_file rather than input, to avoid shadowing the builtin
        for input_file, sample in zip(inputs, samples):
            mapobj = Map(input_file, sample, minsize, maxsize, cluster,
                         nostrand)
            mapobj.write_cluster_table(mapobj.map_dict, out, bedfile)
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
237
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
238
8028521b6e4f "planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_clusters commit f38805cf151cbda1cf7de0a92cdfeb5978f26547"
artbio
parents:
diff changeset
if __name__ == "__main__":
    # args stays a module-level name: write_cluster_table reads its
    # bed_skip* thresholds from it.
    args = Parser()
    names = args.sample_names
    # When at least two samples share a name, every name is suffixed
    # with its position so that all of them become distinct.
    if len(names) != len(set(names)):
        args.sample_names = ['_'.join((name, str(i)))
                             for i, name in enumerate(names)]
    main(args.inputs,
         args.sample_names,
         args.outputs,
         args.minsize,
         args.maxsize,
         args.cluster,
         args.nostrand,
         args.bed)