changeset 6:26014c24323a draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author mheinzl
date Fri, 26 Oct 2018 07:54:03 -0400
parents 52454637bc45
children 3b8a0e462021
files fsd_regions.py test-data/Test_data_regions.txt test-data/output_file.pdf test-data/output_file.tabular
diffstat 4 files changed, 50 insertions(+), 114 deletions(-) [+]
line wrap: on
line diff
--- a/fsd_regions.py	Wed Oct 17 05:23:33 2018 -0400
+++ b/fsd_regions.py	Fri Oct 26 07:54:03 2018 -0400
@@ -56,10 +56,8 @@
         data_array = readFileReferenceFree(firstFile, "\t")
 
         mut_array = readFileReferenceFree(refGenome, " ")
-        length_regions = len(mut_array)
         group = numpy.array(mut_array[:, 0])
         seq_mut = numpy.array(mut_array[:, 1])
-        alt_group = numpy.array(mut_array[:, 2])
 
         seq = numpy.array(data_array[:, 1])
         tags = numpy.array(data_array[:, 2])
@@ -73,14 +71,15 @@
         seqDic_ab = dict(zip(all_ab, quant_ab))
         seqDic_ba = dict(zip(all_ba, quant_ba))
 
-        if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None:
+        if re.search('_(\d)+_(\d)+$', str(mut_array[0,0])) is None:
             seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
             group = mut_array[seqMut_index,0]
-            alt_group = mut_array[seqMut_index,2]
             mut_array = mut_array[seqMut_index,:]
+        length_regions = len(seq_mut)*2
 
         groupUnique, group_index = numpy.unique(group, return_index=True)
         groupUnique = groupUnique[numpy.argsort(group_index)]
+
         lst_ab = []
         lst_ba = []
         for i in seq_mut:
@@ -90,58 +89,19 @@
         quant_ab = numpy.array(lst_ab)
         quant_ba = numpy.array(lst_ba)
 
-        quantAfterRegion = OrderedDict()
-        for key in groupUnique:
-            quantAfterRegion[key] = []
+        quantAfterRegion = []
 
         for i in groupUnique:
-            index_of_current_region = numpy.where(group == i)[0]
-            quant_ba_i = quant_ba[index_of_current_region]
-            alt_group_i = alt_group[index_of_current_region]
-            index_alternative_refs = numpy.where(alt_group_i != "=")[0]
-
-            dataAB = quant_ab[index_of_current_region]
+            dataAB = quant_ab[numpy.where(group == i)[0]]
+            dataBA = quant_ba[numpy.where(group == i)[0]]
             bigFamilies = numpy.where(dataAB > 20)[0]
             dataAB[bigFamilies] = 22
-            for el in dataAB:
-                quantAfterRegion[i].append(el)
-
-            if len(index_alternative_refs) == 0:
-                dataBA = quant_ba_i
-                bigFamilies = numpy.where(dataBA > 20)[0]
-                dataBA[bigFamilies] = 22
-                for el2 in dataBA:
-                    quantAfterRegion[i].append(el2)
-            else:  # get tags where 2nd mate is aligned to a different ref genome
-                unique_alt = numpy.unique(alt_group_i[index_alternative_refs])
-                for alt in unique_alt:
-                    ind_alt_tags = numpy.where(alt_group_i == alt)[0]
-                    dataBA = quant_ba_i[ind_alt_tags]
+            bigFamilies = numpy.where(dataBA > 20)[0]
+            dataBA[bigFamilies] = 22
 
-                    bigFamilies = numpy.where(dataBA > 20)[0]
-                    if len(bigFamilies) != 0:
-                        if len(bigFamilies) == 1 and type(dataBA) != list:
-                            dataBA = 22
-                            quantAfterRegion[alt].append(dataBA)
-                        else:
-                            dataBA[bigFamilies] = 22
-                            for el3 in dataBA:
-                                quantAfterRegion[alt].append(el3)
+            quantAll = numpy.concatenate((dataAB, dataBA))
+            quantAfterRegion.append(quantAll)
 
-                index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs]
-                data_BA_other = quant_ba_i[index_inverse]
-                bigFamilies_other = numpy.where(data_BA_other > 20)[0]
-
-                if len(bigFamilies_other) != 0:
-                    if len(bigFamilies_other) == 1 and type(data_BA_other) != list:
-                        data_BA_other = 22
-                        quantAfterRegion[i].append(data_BA_other)
-                    else:
-                        data_BA_other[bigFamilies_other] = 22
-                        for el3 in data_BA_other:
-                            quantAfterRegion[i].append(el3)
-
-        quantAfterRegion = quantAfterRegion.values()
         maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
         minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))
 
@@ -184,49 +144,24 @@
         plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure)
 
         plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure)
-        plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure)
+        plt.text(0.75, 0.22, "{:,} ({:,})".format(length_regions, length_regions/2), size=11, transform=plt.gcf().transFigure)
 
-        legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.'
-        plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
+        #  legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.'
+        #  plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
 
-        space = numpy.arange(0, len(groupUnique), 0.02)
-        plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure)
-        plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure)
-        lengths_array_ab = []
-        lengths_array_ba = []
-
+        plt.text(0.75, 0.18, "total nr. of tags per region", size=11, transform=plt.gcf().transFigure)
+        #space = numpy.arange(0, len(groupUnique), 0.02)
+        s = 0
         index_array = 0
-        for i, s, count in zip(groupUnique, space, quantAfterRegion):
+        for i, count in zip(groupUnique, quantAfterRegion):
             index_of_current_region = numpy.where(group == i)[0]
-
             plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
-            if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None:
+            if re.search('_(\d)+_(\d)+$', str(mut_array[0, 0])) is None:
                 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1]))
             else:
                 nr_tags_ab = len(mut_array[index_of_current_region, 1])
-
-            plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)
-
-            alt_group_i = alt_group[index_of_current_region]
-            alternative = numpy.where(alt_group_i != "=")[0]
-            unique_alt = numpy.unique(alt_group_i[alternative])
-            lengths_of_alt_aligned_tags = []
-            if len(alternative) != 0:
-                for alt in unique_alt:
-                    ind_alt_tags = numpy.where(alt_group_i == alt)[0]
-                    name = "{:,} to {}".format(len(ind_alt_tags), alt)
-                    lengths_of_alt_aligned_tags.append(name)
-                ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]
-                name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)
-                lengths_of_alt_aligned_tags.append(name_inverse)
-                plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)
-                lengths_array_ab.append(nr_tags_ab)
-                lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))
-            else:
-                plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure)
-                lengths_array_ab.append(nr_tags_ab)
-                lengths_array_ba.append(nr_tags_ab)
-            index_array += 1
+            plt.text(0.75, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)
+            s = s + 0.02
 
         plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
         plt.xlabel("Family size", fontsize=14)
@@ -243,7 +178,7 @@
         output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
         output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
         output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
-        output_file.write("total nr. of tags{}{}\n".format(sep, length_regions))
+        output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions/2))
 
         output_file.write("\n\nValues from family size distribution\n")
         output_file.write("{}".format(sep))
@@ -273,11 +208,11 @@
             for i in counts[0]:
                 output_file.write("{}{}".format(int(sum(i)), sep))
         output_file.write("\n")
-        output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep))
+        output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the count of the tags per region.\n")
+        output_file.write("\n\nRegion{}total nr. of tags per region\n".format(sep, sep))
 
-        for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique):
-            output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba))
-
+        for i, count in zip(groupUnique, quantAfterRegion):
+            output_file.write("{}{}{}\n".format(i,sep,len(count) / 2))
     print("Files successfully created!")
 
 
--- a/test-data/Test_data_regions.txt	Wed Oct 17 05:23:33 2018 -0400
+++ b/test-data/Test_data_regions.txt	Fri Oct 26 07:54:03 2018 -0400
@@ -1,17 +1,16 @@
-87_636 AAAAAACATCCCAATAAGAAATCA
-87_636 AAAAAAGTCCTTCGACTCAAGCGG
-87_636 AAAAAATAGTTAAGCCGACACACT
-87_636 AAAAAATGTGCCGAACCTTGGCGA
-87_636 AAAAACAACATAGCTTGAAAATTT
-656_1143 ATTCGGATAATTCGACGCAACATT
-656_1143 ATTCGTCGACAATACAAAGGGGCC
-656_1143 ATTGCCAGTGTGGGCTGGTTAGTA
-656_1143 ATTTCGCGACCATCCGCCACTTTG
-656_1143 CAAACTTTAGCACAGTGTGTGTCC
-1141_1564 ATAAACGGCCTTCGACATTGTGAC
-1141_1564 ATAAAGTCACCTGTGAATACGTTG
-1141_1564 ATAAATCGAAACCGTGCCCAACAA
-1892_2398 ATTTAGATATTTTCTTCTTTTTCT
-1892_2398 ATTTAGTTATCCGTCGGCGACGAA
-1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
-
+ACH_87_636 AAAAAACATCCCAATAAGAAATCA
+ACH_87_636 AAAAAAGTCCTTCGACTCAAGCGG
+ACH_87_636 AAAAAATAGTTAAGCCGACACACT
+ACH_87_636 AAAAAATGTGCCGAACCTTGGCGA
+ACH_87_636 AAAAACAACATAGCTTGAAAATTT
+ACH_656_1143 ATTCGGATAATTCGACGCAACATT
+ACH_656_1143 ATTCGTCGACAATACAAAGGGGCC
+ACH_656_1143 ATTGCCAGTGTGGGCTGGTTAGTA
+ACH_656_1143 ATTTCGCGACCATCCGCCACTTTG
+ACH_656_1143 CAAACTTTAGCACAGTGTGTGTCC
+ACH_1141_1564 ATAAACGGCCTTCGACATTGTGAC
+ACH_1141_1564 ATAAAGTCACCTGTGAATACGTTG
+ACH_1141_1564 ATAAATCGAAACCGTGCCCAACAA
+ACH_1892_2398 ATTTAGATATTTTCTTCTTTTTCT
+ACH_1892_2398 ATTTAGTTATCCGTCGGCGACGAA
+ACH_1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
Binary file test-data/output_file.pdf has changed
--- a/test-data/output_file.tabular	Wed Oct 17 05:23:33 2018 -0400
+++ b/test-data/output_file.tabular	Fri Oct 26 07:54:03 2018 -0400
@@ -5,10 +5,11 @@
 relative frequency:	0.209	0.062
 
 total nr. of reads	1312
+total nr. of tags	32 (16)
 
 
 Values from family size distribution
-	87_636	656_1143	1141_1564	1892_2398	
+	ACH_87_636	ACH_656_1143	ACH_1141_1564	ACH_1892_2398	
 FS=3	0	0	0	1	
 FS=4	2	0	0	0	
 FS=5	2	0	0	0	
@@ -32,10 +33,11 @@
 
 
 In the plot, both family sizes of the ab and ba strands were used.
-Whereas the total numbers indicate only the single count of the tags per region.
+Whereas the total numbers indicate only the count of the tags per region.
+
+
 Region	total nr. of tags per region
-87_636	5
-656_1143	5
-1141_1564	3
-1892_2398	3
-sum of tags	16
+ACH_87_636	5
+ACH_656_1143	5
+ACH_1141_1564	3
+ACH_1892_2398	3