# HG changeset patch # User artbio # Date 1510075888 18000 # Node ID 3ea75c5734292b983c9fe7f53d7f70941d341802 # Parent 1827b74f872bd4015d6c5fba8b4ed48935f862ba planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_rna_maps commit 6199193c7fe2cb56403eea8af0b40d44f7311fd5 diff -r 1827b74f872b -r 3ea75c573429 small_rna_maps.py --- a/small_rna_maps.py Mon Oct 23 08:29:39 2017 -0400 +++ b/small_rna_maps.py Tue Nov 07 12:31:28 2017 -0500 @@ -14,6 +14,8 @@ default=0, help='minimal size of reads') the_parser.add_argument('--maxsize', dest='maxsize', type=int, default=10000, help='maximal size of reads') + the_parser.add_argument('--cluster', dest='cluster', type=int, + default=0, help='clustering distance') the_parser.add_argument('--sample_names', dest='sample_names', required=True, nargs='+', help='list of sample names') @@ -28,15 +30,18 @@ class Map: - def __init__(self, bam_file, sample, minsize, maxsize): + def __init__(self, bam_file, sample, minsize, maxsize, cluster): self.sample_name = sample self.minsize = minsize self.maxsize = maxsize + self.cluster = cluster self.bam_object = pysam.AlignmentFile(bam_file, 'rb') self.chromosomes = dict(zip(self.bam_object.references, self.bam_object.lengths)) self.map_dict = self.create_map(self.bam_object, self.minsize, self.maxsize) + if self.cluster: + self.map_dict = self.tile_map(self.map_dict, self.cluster) def create_map(self, bam_object, minsize, maxsize): ''' @@ -44,11 +49,10 @@ [read_length, ...]} ''' map_dictionary = defaultdict(list) - # get empty value for start and end of each chromosome for chrom in self.chromosomes: + # get empty value for start and end of each chromosome map_dictionary[(chrom, 1, 'F')] = [] map_dictionary[(chrom, self.chromosomes[chrom], 'F')] = [] - for chrom in self.chromosomes: for read in bam_object.fetch(chrom): if (read.query_alignment_length >= minsize and read.query_alignment_length <= maxsize): @@ -61,6 +65,62 @@ read.query_alignment_length) return map_dictionary + def grouper(self, iterable, clust_distance): + prev = None + group = [] + for item in iterable: + if not prev or item - prev <= clust_distance: + group.append(item) + else: + yield group + group = [item] + prev = item + if group: + yield group + + def tile_map(self, map_dic, clust_distance): + ''' + takes a map_dictionary {(chromosome,read_position,polarity): + [read_length, ...]} + and retur a map_dictionary with same structure but with + read positions aggregated by size + ''' + clustered_dic = defaultdict(list) + for chrom in self.chromosomes: + clustered_dic[(chrom, 1, 'F')] = [] + clustered_dic[(chrom, self.chromosomes[chrom], 'F')] = [] + F_chrom_coord = [] + R_chrom_coord = [] + for key in map_dic: + if key[0] == chrom and key[2] == 'F': + F_chrom_coord.append(key[1]) + elif key[0] == chrom and key[2] == 'R': + R_chrom_coord.append(key[1]) + F_chrom_coord = list(set(F_chrom_coord)) + R_chrom_coord = list(set(R_chrom_coord)) + F_chrom_coord.sort() + R_chrom_coord.sort() + F_clust_values = [i for i in self.grouper(F_chrom_coord, + clust_distance)] + F_clust_keys = [(i[-1]+i[0])/2 for i in F_clust_values] + R_clust_values = [i for i in self.grouper(R_chrom_coord, + clust_distance)] + R_clust_keys = [(i[-1]+i[0])/2 for i in R_clust_values] + F_clust_dic = dict(zip(F_clust_keys, F_clust_values)) + R_clust_dic = dict(zip(R_clust_keys, R_clust_values)) + # {centered_coordinate: [coord1, coord2, coord3, ..]} + for centcoor in F_clust_dic: + accumulator = [] + for coor in F_clust_dic[centcoor]: + accumulator.extend(map_dic[(chrom, coor, 'F')]) + clustered_dic[(chrom, centcoor, 'F')] = accumulator + for centcoor in R_clust_dic: + accumulator = [] + for coor in R_clust_dic[centcoor]: + accumulator.extend(map_dic[(chrom, coor, 'R')]) + clustered_dic[(chrom, centcoor, 'R')] = accumulator + return clustered_dic + def compute_readcount(self, map_dictionary, out): ''' takes a map_dictionary as input and writes @@ -191,7 +251,7 @@ out.write('\t'.join(line) + '\n') -def main(inputs, samples, methods, outputs, minsize, maxsize): +def main(inputs, samples, methods, outputs, minsize, maxsize, cluster): for method, output in zip(methods, outputs): F = open(output, 'w') if method == 'Size': @@ -201,7 +261,7 @@ "Polarity", method] F.write('\t'.join(header) + '\n') for input, sample in zip(inputs, samples): - mapobj = Map(input, sample, minsize, maxsize) + mapobj = Map(input, sample, minsize, maxsize, cluster) token = {"Counts": mapobj.compute_readcount, "Max": mapobj.compute_max, "Mean": mapobj.compute_mean, @@ -219,4 +279,4 @@ args.sample_names = [name + '_' + str(i) for i, name in enumerate(args.sample_names)] main(args.inputs, args.sample_names, args.plot_methods, args.outputs, - args.minsize, args.maxsize) + args.minsize, args.maxsize, args.cluster) diff -r 1827b74f872b -r 3ea75c573429 small_rna_maps.xml --- a/small_rna_maps.xml Mon Oct 23 08:29:39 2017 -0400 +++ b/small_rna_maps.xml Tue Nov 07 12:31:28 2017 -0500 @@ -1,4 +1,4 @@ - + numpy @@ -27,6 +27,7 @@ #end for --minsize $minsize --maxsize $maxsize + --cluster $cluster #if str($plots_options.plots_options_selector ) == "two_plot": --plot_methods '${plots_options.first_plot}' '${plots_options.extra_plot}' --outputs '$output_tab' '$extra_output_tab' && @@ -62,6 +63,8 @@ value="0" help="default value: 0" /> + @@ -115,7 +118,19 @@ + + + + + + + + + + + + @@ -127,6 +142,7 @@ + @@ -138,6 +154,7 @@ + @@ -149,6 +166,7 @@ + @@ -161,6 +179,7 @@ + @@ -173,6 +192,7 @@ + @@ -185,6 +205,7 @@ + @@ -197,6 +218,7 @@ + @@ -209,6 +231,7 @@ + @@ -221,6 +244,7 @@ + @@ -233,6 +257,7 @@ + @@ -244,6 +269,7 @@ + @@ -255,6 +281,7 @@ + @@ -265,6 +292,7 @@ + @@ -275,6 +303,7 @@ + diff -r 1827b74f872b -r 3ea75c573429 test-data/count_cluster_5.pdf Binary file test-data/count_cluster_5.pdf has changed diff -r 1827b74f872b -r 3ea75c573429 test-data/count_cluster_5.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/count_cluster_5.tab Tue Nov 07 12:31:28 2017 -0500 @@ -0,0 +1,265 @@ +Dataset Chromosome Chrom_length Coordinate Polarity Counts +input2.bam FBtr0070001 72 1 F 1 +input2.bam FBtr0070001 72 15 F 4 +input2.bam FBtr0070001 72 30 F 4 +input2.bam FBtr0070001 72 55 F 15 +input2.bam FBtr0070001 72 72 F 0 +input2.bam FBtr0070533 72 1 F 0 +input2.bam FBtr0070533 72 21 F 44 +input2.bam FBtr0070533 72 39 F 1 +input2.bam FBtr0070533 72 72 F 0 +input2.bam FBtr0070603 72 1 F 0 +input2.bam FBtr0070603 72 26 F 57 +input2.bam FBtr0070603 72 52 F 4 +input2.bam FBtr0070603 72 72 F 0 +input2.bam FBtr0070604 72 1 F 0 +input2.bam FBtr0070604 72 2 F 1 +input2.bam FBtr0070604 72 31 F 6 +input2.bam FBtr0070604 72 40 F 1 +input2.bam FBtr0070604 72 55 F 47 +input2.bam FBtr0070604 72 72 F 0 +input2.bam FBtr0070911 73 1 F 0 +input2.bam FBtr0070911 73 11 F 1 +input2.bam FBtr0070911 73 32 F 1 +input2.bam FBtr0070911 73 48 F 1 +input2.bam FBtr0070911 73 73 F 0 +input2.bam FBtr0078490 72 1 F 3 +input2.bam FBtr0078490 72 13 F 6 +input2.bam FBtr0078490 72 20 F 1 +input2.bam FBtr0078490 72 30 F 1 +input2.bam FBtr0078490 72 39 F 3 +input2.bam FBtr0078490 72 72 F 0 +input2.bam FBtr0078580 72 1 F 146 +input2.bam FBtr0078580 72 21 F 215 +input2.bam FBtr0078580 72 41 F 6 +input2.bam FBtr0078580 72 52 F 3 +input2.bam FBtr0078580 72 72 F 0 +input2.bam FBtr0078790 73 1 F 0 +input2.bam FBtr0078790 73 17 F 1 +input2.bam FBtr0078790 73 47 F 9 +input2.bam FBtr0078790 73 73 F 0 +input2.bam FBtr0079064 72 1 F 3 +input2.bam FBtr0079064 72 52 F 2 +input2.bam FBtr0079064 72 72 F 0 +input2.bam FBtr0079090 72 1 F 1 +input2.bam FBtr0079090 72 18 F 2 +input2.bam FBtr0079090 72 34 F 2 +input2.bam FBtr0079090 72 52 F 2 +input2.bam FBtr0079090 72 72 F 0 +input2.bam FBtr0079338 73 1 F 0 +input2.bam FBtr0079338 73 15 F 19 +input2.bam FBtr0079338 73 40 F 4 +input2.bam FBtr0079338 73 73 F 0 +input2.bam FBtr0079528 71 1 F 0 +input2.bam FBtr0079528 71 2 F 3 +input2.bam FBtr0079528 71 16 F 114 +input2.bam FBtr0079528 71 43 F 248 +input2.bam FBtr0079528 71 71 F 0 +input2.bam FBtr0079596 73 1 F 0 +input2.bam FBtr0079596 73 11 F 311 +input2.bam FBtr0079596 73 42 F 1 +input2.bam FBtr0079596 73 53 F 3 +input2.bam FBtr0079596 73 59 R 1 +input2.bam FBtr0079596 73 73 F 0 +input2.bam FBtr0079677 72 1 F 2 +input2.bam FBtr0079677 72 15 F 1 +input2.bam FBtr0079677 72 24 F 1 +input2.bam FBtr0079677 72 43 F 1 +input2.bam FBtr0079677 72 53 F 2 +input2.bam FBtr0079677 72 72 F 0 +input2.bam FBtr0079690 72 1 F 1 +input2.bam FBtr0079690 72 18 F 4 +input2.bam FBtr0079690 72 72 F 0 +input2.bam FBtr0079692 73 1 F 0 +input2.bam FBtr0079692 73 8 F 4 +input2.bam FBtr0079692 73 73 F 0 +input2.bam FBtr0079693 72 1 F 3 +input2.bam FBtr0079693 72 18 F 3 +input2.bam FBtr0079693 72 45 F 1 +input2.bam FBtr0079693 72 53 F 2 +input2.bam FBtr0079693 72 72 F 0 +input2.bam FBtr0079694 72 1 F 2 +input2.bam FBtr0079694 72 17 F 3 +input2.bam FBtr0079694 72 52 F 1 +input2.bam FBtr0079694 72 72 F 0 +input2.bam FBtr0079702 72 1 F 2 +input2.bam FBtr0079702 72 52 F 2 +input2.bam FBtr0079702 72 72 F 0 +input2.bam FBtr0079728 72 1 F 3 +input2.bam FBtr0079728 72 20 F 1 +input2.bam FBtr0079728 72 52 F 2 +input2.bam FBtr0079728 72 72 F 0 +input2.bam FBtr0079729 72 1 F 1 +input2.bam FBtr0079729 72 19 F 1 +input2.bam FBtr0079729 72 33 F 1 +input2.bam FBtr0079729 72 53 F 1 +input2.bam FBtr0079729 72 72 F 0 +input2.bam FBtr0079752 72 1 F 0 +input2.bam FBtr0079752 72 50 F 2 +input2.bam FBtr0079752 72 72 F 0 +input2.bam FBtr0079820 74 1 F 0 +input2.bam FBtr0079820 74 39 F 1 +input2.bam FBtr0079820 74 56 F 108 +input2.bam FBtr0079820 74 74 F 0 +input2.bam FBtr0080609 72 1 F 0 +input2.bam FBtr0080609 72 21 F 4 +input2.bam FBtr0080609 72 33 F 1 +input2.bam FBtr0080609 72 39 F 1 +input2.bam FBtr0080609 72 52 F 2 +input2.bam FBtr0080609 72 72 F 0 +input2.bam FBtr0080644 72 1 F 0 +input2.bam FBtr0080644 72 57 F 8 +input2.bam FBtr0080644 72 72 F 0 +input2.bam FBtr0080646 72 1 F 0 +input2.bam FBtr0080646 72 51 F 1 +input2.bam FBtr0080646 72 57 F 11 +input2.bam FBtr0080646 72 72 F 0 +input2.bam FBtr0080647 72 1 F 1 +input2.bam FBtr0080647 72 57 F 11 +input2.bam FBtr0080647 72 72 F 0 +input2.bam FBtr0080660 72 1 F 0 +input2.bam FBtr0080660 72 58 F 11 +input2.bam FBtr0080660 72 72 F 0 +input2.bam FBtr0080663 71 1 F 2 +input2.bam FBtr0080663 71 16 F 95 +input2.bam FBtr0080663 71 31 F 1 +input2.bam FBtr0080663 71 53 F 8 +input2.bam FBtr0080663 71 71 F 0 +input2.bam FBtr0080664 71 1 F 0 +input2.bam FBtr0080664 71 2 F 3 +input2.bam FBtr0080664 71 17 F 114 +input2.bam FBtr0080664 71 43 F 270 +input2.bam FBtr0080664 71 62 R 1 +input2.bam FBtr0080664 71 71 F 0 +input2.bam FBtr0080664 71 71 R 1 +input1.bam FBtr0070001 72 1 F 1 +input1.bam FBtr0070001 72 12 F 14 +input1.bam FBtr0070001 72 30 F 42 +input1.bam FBtr0070001 72 72 F 0 +input1.bam FBtr0070533 72 1 F 1 +input1.bam FBtr0070533 72 25 F 23 +input1.bam FBtr0070533 72 72 F 0 +input1.bam FBtr0070603 72 1 F 0 +input1.bam FBtr0070603 72 21 F 68 +input1.bam FBtr0070603 72 53 F 2 +input1.bam FBtr0070603 72 72 F 0 +input1.bam FBtr0070604 72 1 F 1 +input1.bam FBtr0070604 72 20 F 2 +input1.bam FBtr0070604 72 31 F 36 +input1.bam FBtr0070604 72 57 F 1 +input1.bam FBtr0070604 72 72 F 0 +input1.bam FBtr0070911 73 1 F 0 +input1.bam FBtr0070911 73 15 F 1 +input1.bam FBtr0070911 73 38 F 1 +input1.bam FBtr0070911 73 73 F 0 +input1.bam FBtr0078490 72 1 F 0 +input1.bam FBtr0078490 72 15 F 4 +input1.bam FBtr0078490 72 26 F 3 +input1.bam FBtr0078490 72 38 F 1 +input1.bam FBtr0078490 72 72 F 0 +input1.bam FBtr0078580 72 1 F 0 +input1.bam FBtr0078580 72 16 F 1102 +input1.bam FBtr0078580 72 52 F 2 +input1.bam FBtr0078580 72 72 F 0 +input1.bam FBtr0078790 73 1 F 1 +input1.bam FBtr0078790 73 17 F 2 +input1.bam FBtr0078790 73 33 F 1 +input1.bam FBtr0078790 73 47 F 9 +input1.bam FBtr0078790 73 69 R 1 +input1.bam FBtr0078790 73 73 F 0 +input1.bam FBtr0079064 72 1 F 0 +input1.bam FBtr0079064 72 2 F 2 +input1.bam FBtr0079064 72 33 F 1 +input1.bam FBtr0079064 72 52 F 1 +input1.bam FBtr0079064 72 72 F 0 +input1.bam FBtr0079090 72 1 F 2 +input1.bam FBtr0079090 72 26 F 1 +input1.bam FBtr0079090 72 33 F 1 +input1.bam FBtr0079090 72 53 F 1 +input1.bam FBtr0079090 72 56 R 1 +input1.bam FBtr0079090 72 72 F 0 +input1.bam FBtr0079338 73 1 F 0 +input1.bam FBtr0079338 73 14 F 5 +input1.bam FBtr0079338 73 25 F 1 +input1.bam FBtr0079338 73 44 F 10 +input1.bam FBtr0079338 73 73 F 0 +input1.bam FBtr0079528 71 1 F 0 +input1.bam FBtr0079528 71 9 F 97 +input1.bam FBtr0079528 71 28 F 1 +input1.bam FBtr0079528 71 36 F 3 +input1.bam FBtr0079528 71 51 F 5 +input1.bam FBtr0079528 71 71 F 0 +input1.bam FBtr0079596 73 1 F 0 +input1.bam FBtr0079596 73 10 F 148 +input1.bam FBtr0079596 73 53 F 4 +input1.bam FBtr0079596 73 73 F 0 +input1.bam FBtr0079677 72 1 F 0 +input1.bam FBtr0079677 72 3 F 2 +input1.bam FBtr0079677 72 52 F 2 +input1.bam FBtr0079677 72 72 F 0 +input1.bam FBtr0079690 72 1 F 1 +input1.bam FBtr0079690 72 24 F 2 +input1.bam FBtr0079690 72 33 F 2 +input1.bam FBtr0079690 72 72 F 0 +input1.bam FBtr0079692 73 1 F 3 +input1.bam FBtr0079692 73 18 F 1 +input1.bam FBtr0079692 73 25 F 1 +input1.bam FBtr0079692 73 32 F 1 +input1.bam FBtr0079692 73 73 F 0 +input1.bam FBtr0079693 72 1 F 5 +input1.bam FBtr0079693 72 25 F 1 +input1.bam FBtr0079693 72 72 F 0 +input1.bam FBtr0079694 72 1 F 5 +input1.bam FBtr0079694 72 18 F 1 +input1.bam FBtr0079694 72 52 F 1 +input1.bam FBtr0079694 72 72 F 0 +input1.bam FBtr0079702 72 1 F 1 +input1.bam FBtr0079702 72 19 F 2 +input1.bam FBtr0079702 72 56 F 1 +input1.bam FBtr0079702 72 72 F 0 +input1.bam FBtr0079728 72 1 F 2 +input1.bam FBtr0079728 72 8 F 1 +input1.bam FBtr0079728 72 19 F 1 +input1.bam FBtr0079728 72 33 F 3 +input1.bam FBtr0079728 72 56 F 1 +input1.bam FBtr0079728 72 72 F 0 +input1.bam FBtr0079729 72 1 F 1 +input1.bam FBtr0079729 72 13 F 1 +input1.bam FBtr0079729 72 54 F 2 +input1.bam FBtr0079729 72 72 F 0 +input1.bam FBtr0079752 72 1 F 2 +input1.bam FBtr0079752 72 9 F 2 +input1.bam FBtr0079752 72 33 F 2 +input1.bam FBtr0079752 72 52 F 2 +input1.bam FBtr0079752 72 72 F 0 +input1.bam FBtr0079820 74 1 F 0 +input1.bam FBtr0079820 74 50 F 13 +input1.bam FBtr0079820 74 74 F 0 +input1.bam FBtr0080609 72 1 F 0 +input1.bam FBtr0080609 72 10 F 60 +input1.bam FBtr0080609 72 42 F 1 +input1.bam FBtr0080609 72 51 F 2 +input1.bam FBtr0080609 72 72 F 0 +input1.bam FBtr0080644 72 1 F 0 +input1.bam FBtr0080644 72 6 F 6 +input1.bam FBtr0080644 72 72 F 0 +input1.bam FBtr0080646 72 1 F 0 +input1.bam FBtr0080646 72 4 F 3 +input1.bam FBtr0080646 72 72 F 0 +input1.bam FBtr0080647 72 1 F 0 +input1.bam FBtr0080647 72 6 F 10 +input1.bam FBtr0080647 72 72 F 0 +input1.bam FBtr0080660 72 1 F 0 +input1.bam FBtr0080660 72 4 F 7 +input1.bam FBtr0080660 72 72 F 0 +input1.bam FBtr0080663 71 1 F 0 +input1.bam FBtr0080663 71 9 F 102 +input1.bam FBtr0080663 71 26 F 1 +input1.bam FBtr0080663 71 32 F 2 +input1.bam FBtr0080663 71 50 F 10 +input1.bam FBtr0080663 71 71 F 0 +input1.bam FBtr0080664 71 1 F 0 +input1.bam FBtr0080664 71 11 F 108 +input1.bam FBtr0080664 71 33 F 9 +input1.bam FBtr0080664 71 53 F 11 +input1.bam FBtr0080664 71 71 F 0