Repository 'fsd_regions'
hg clone https://toolshed.g2.bx.psu.edu/repos/mheinzl/fsd_regions

Changeset 6:26014c24323a (2018-10-26)
Previous changeset 5:52454637bc45 (2018-10-17) Next changeset 7:3b8a0e462021 (2018-11-20)
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
modified:
fsd_regions.py
test-data/Test_data_regions.txt
test-data/output_file.pdf
test-data/output_file.tabular
b
diff -r 52454637bc45 -r 26014c24323a fsd_regions.py
--- a/fsd_regions.py Wed Oct 17 05:23:33 2018 -0400
+++ b/fsd_regions.py Fri Oct 26 07:54:03 2018 -0400
[
b'@@ -56,10 +56,8 @@\n         data_array = readFileReferenceFree(firstFile, "\\t")\n \n         mut_array = readFileReferenceFree(refGenome, " ")\n-        length_regions = len(mut_array)\n         group = numpy.array(mut_array[:, 0])\n         seq_mut = numpy.array(mut_array[:, 1])\n-        alt_group = numpy.array(mut_array[:, 2])\n \n         seq = numpy.array(data_array[:, 1])\n         tags = numpy.array(data_array[:, 2])\n@@ -73,14 +71,15 @@\n         seqDic_ab = dict(zip(all_ab, quant_ab))\n         seqDic_ba = dict(zip(all_ba, quant_ba))\n \n-        if re.search(\'^(\\d)+_(\\d)+\', str(mut_array[0,0])) is None:\n+        if re.search(\'_(\\d)+_(\\d)+$\', str(mut_array[0,0])) is None:\n             seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)\n             group = mut_array[seqMut_index,0]\n-            alt_group = mut_array[seqMut_index,2]\n             mut_array = mut_array[seqMut_index,:]\n+        length_regions = len(seq_mut)*2\n \n         groupUnique, group_index = numpy.unique(group, return_index=True)\n         groupUnique = groupUnique[numpy.argsort(group_index)]\n+\n         lst_ab = []\n         lst_ba = []\n         for i in seq_mut:\n@@ -90,58 +89,19 @@\n         quant_ab = numpy.array(lst_ab)\n         quant_ba = numpy.array(lst_ba)\n \n-        quantAfterRegion = OrderedDict()\n-        for key in groupUnique:\n-            quantAfterRegion[key] = []\n+        quantAfterRegion = []\n \n         for i in groupUnique:\n-            index_of_current_region = numpy.where(group == i)[0]\n-            quant_ba_i = quant_ba[index_of_current_region]\n-            alt_group_i = alt_group[index_of_current_region]\n-            index_alternative_refs = numpy.where(alt_group_i != "=")[0]\n-\n-            dataAB = quant_ab[index_of_current_region]\n+            dataAB = quant_ab[numpy.where(group == i)[0]]\n+            dataBA = quant_ba[numpy.where(group == i)[0]]\n             bigFamilies = numpy.where(dataAB > 20)[0]\n             dataAB[bigFamilies] = 22\n-            for el in dataAB:\n-                quantAfterRegion[i].append(el)\n-\n-            if len(index_alternative_refs) == 0:\n-                dataBA = quant_ba_i\n-                bigFamilies = numpy.where(dataBA > 20)[0]\n-                dataBA[bigFamilies] = 22\n-                for el2 in dataBA:\n-                    quantAfterRegion[i].append(el2)\n-            else:  # get tags where 2nd mate is aligned to a different ref genome\n-                unique_alt = numpy.unique(alt_group_i[index_alternative_refs])\n-                for alt in unique_alt:\n-                    ind_alt_tags = numpy.where(alt_group_i == alt)[0]\n-                    dataBA = quant_ba_i[ind_alt_tags]\n+            bigFamilies = numpy.where(dataBA > 20)[0]\n+            dataBA[bigFamilies] = 22\n \n-                    bigFamilies = numpy.where(dataBA > 20)[0]\n-                    if len(bigFamilies) != 0:\n-                        if len(bigFamilies) == 1 and type(dataBA) != list:\n-                            dataBA = 22\n-                            quantAfterRegion[alt].append(dataBA)\n-                        else:\n-                            dataBA[bigFamilies] = 22\n-                            for el3 in dataBA:\n-                                quantAfterRegion[alt].append(el3)\n+            quantAll = numpy.concatenate((dataAB, dataBA))\n+            quantAfterRegion.append(quantAll)\n \n-                index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs]\n-                data_BA_other = quant_ba_i[index_inverse]\n-                bigFamilies_other = numpy.where(data_BA_other > 20)[0]\n-\n-                if len(bigFamilies_other) != 0:\n-                    if len(bigFamilies_other) == 1 and type(data_BA_other) != list:\n-                        data_BA_other = 22\n-                        quantAfterRegion[i].append(data_BA_other)\n-                    else:\n-                        data_BA_other[bigFamilies_other] = 22\n-                        for el3 in data'..b'=11, transform=plt.gcf().transFigure)\n+        #space = numpy.arange(0, len(groupUnique), 0.02)\n+        s = 0\n         index_array = 0\n-        for i, s, count in zip(groupUnique, space, quantAfterRegion):\n+        for i, count in zip(groupUnique, quantAfterRegion):\n             index_of_current_region = numpy.where(group == i)[0]\n-\n             plt.text(0.55, 0.14 - s, "{}=\\n".format(i), size=11, transform=plt.gcf().transFigure)\n-            if re.search(\'^(\\d)+_(\\d)+\', str(mut_array[0, 0])) is None:\n+            if re.search(\'_(\\d)+_(\\d)+$\', str(mut_array[0, 0])) is None:\n                 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1]))\n             else:\n                 nr_tags_ab = len(mut_array[index_of_current_region, 1])\n-\n-            plt.text(0.7, 0.14 - s, "{:,}\\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)\n-\n-            alt_group_i = alt_group[index_of_current_region]\n-            alternative = numpy.where(alt_group_i != "=")[0]\n-            unique_alt = numpy.unique(alt_group_i[alternative])\n-            lengths_of_alt_aligned_tags = []\n-            if len(alternative) != 0:\n-                for alt in unique_alt:\n-                    ind_alt_tags = numpy.where(alt_group_i == alt)[0]\n-                    name = "{:,} to {}".format(len(ind_alt_tags), alt)\n-                    lengths_of_alt_aligned_tags.append(name)\n-                ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]\n-                name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)\n-                lengths_of_alt_aligned_tags.append(name_inverse)\n-                plt.text(0.78, 0.14 - s, "{}\\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)\n-                lengths_array_ab.append(nr_tags_ab)\n-                lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))\n-            else:\n-                plt.text(0.78, 0.14 - s, "=\\n", size=11,transform=plt.gcf().transFigure)\n-                lengths_array_ab.append(nr_tags_ab)\n-                lengths_array_ba.append(nr_tags_ab)\n-            index_array += 1\n+            plt.text(0.75, 0.14 - s, "{:,}\\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)\n+            s = s + 0.02\n \n         plt.legend(loc=\'upper right\', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)\n         plt.xlabel("Family size", fontsize=14)\n@@ -243,7 +178,7 @@\n         output_file.write("absolute frequency:{}{}{}{}\\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))\n         output_file.write("relative frequency:{}{:.3f}{}{:.3f}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))\n         output_file.write("total nr. of reads{}{}\\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))\n-        output_file.write("total nr. of tags{}{}\\n".format(sep, length_regions))\n+        output_file.write("total nr. of tags{}{} ({})\\n".format(sep, length_regions, length_regions/2))\n \n         output_file.write("\\n\\nValues from family size distribution\\n")\n         output_file.write("{}".format(sep))\n@@ -273,11 +208,11 @@\n             for i in counts[0]:\n                 output_file.write("{}{}".format(int(sum(i)), sep))\n         output_file.write("\\n")\n-        output_file.write("\\n\\nRegion{}total nr. of ab{}ba tags\\n".format(sep, sep))\n+        output_file.write("\\n\\nIn the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the count of the tags per region.\\n")\n+        output_file.write("\\n\\nRegion{}total nr. of tags per region\\n".format(sep, sep))\n \n-        for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique):\n-            output_file.write("{}{}{}{}{}\\n".format(i, sep, ab, sep, ba))\n-\n+        for i, count in zip(groupUnique, quantAfterRegion):\n+            output_file.write("{}{}{}\\n".format(i,sep,len(count) / 2))\n     print("Files successfully created!")\n \n \n'
b
diff -r 52454637bc45 -r 26014c24323a test-data/Test_data_regions.txt
--- a/test-data/Test_data_regions.txt Wed Oct 17 05:23:33 2018 -0400
+++ b/test-data/Test_data_regions.txt Fri Oct 26 07:54:03 2018 -0400
b
@@ -1,17 +1,16 @@
-87_636 AAAAAACATCCCAATAAGAAATCA
-87_636 AAAAAAGTCCTTCGACTCAAGCGG
-87_636 AAAAAATAGTTAAGCCGACACACT
-87_636 AAAAAATGTGCCGAACCTTGGCGA
-87_636 AAAAACAACATAGCTTGAAAATTT
-656_1143 ATTCGGATAATTCGACGCAACATT
-656_1143 ATTCGTCGACAATACAAAGGGGCC
-656_1143 ATTGCCAGTGTGGGCTGGTTAGTA
-656_1143 ATTTCGCGACCATCCGCCACTTTG
-656_1143 CAAACTTTAGCACAGTGTGTGTCC
-1141_1564 ATAAACGGCCTTCGACATTGTGAC
-1141_1564 ATAAAGTCACCTGTGAATACGTTG
-1141_1564 ATAAATCGAAACCGTGCCCAACAA
-1892_2398 ATTTAGATATTTTCTTCTTTTTCT
-1892_2398 ATTTAGTTATCCGTCGGCGACGAA
-1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
-
+ACH_87_636 AAAAAACATCCCAATAAGAAATCA
+ACH_87_636 AAAAAAGTCCTTCGACTCAAGCGG
+ACH_87_636 AAAAAATAGTTAAGCCGACACACT
+ACH_87_636 AAAAAATGTGCCGAACCTTGGCGA
+ACH_87_636 AAAAACAACATAGCTTGAAAATTT
+ACH_656_1143 ATTCGGATAATTCGACGCAACATT
+ACH_656_1143 ATTCGTCGACAATACAAAGGGGCC
+ACH_656_1143 ATTGCCAGTGTGGGCTGGTTAGTA
+ACH_656_1143 ATTTCGCGACCATCCGCCACTTTG
+ACH_656_1143 CAAACTTTAGCACAGTGTGTGTCC
+ACH_1141_1564 ATAAACGGCCTTCGACATTGTGAC
+ACH_1141_1564 ATAAAGTCACCTGTGAATACGTTG
+ACH_1141_1564 ATAAATCGAAACCGTGCCCAACAA
+ACH_1892_2398 ATTTAGATATTTTCTTCTTTTTCT
+ACH_1892_2398 ATTTAGTTATCCGTCGGCGACGAA
+ACH_1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
b
diff -r 52454637bc45 -r 26014c24323a test-data/output_file.pdf
b
Binary file test-data/output_file.pdf has changed
b
diff -r 52454637bc45 -r 26014c24323a test-data/output_file.tabular
--- a/test-data/output_file.tabular Wed Oct 17 05:23:33 2018 -0400
+++ b/test-data/output_file.tabular Fri Oct 26 07:54:03 2018 -0400
b
@@ -5,10 +5,11 @@
 relative frequency: 0.209 0.062
 
 total nr. of reads 1312
+total nr. of tags 32 (16)
 
 
 Values from family size distribution
- 87_636 656_1143 1141_1564 1892_2398
+ ACH_87_636 ACH_656_1143 ACH_1141_1564 ACH_1892_2398
 FS=3 0 0 0 1
 FS=4 2 0 0 0
 FS=5 2 0 0 0
@@ -32,10 +33,11 @@
 
 
 In the plot, both family sizes of the ab and ba strands were used.
-Whereas the total numbers indicate only the single count of the tags per region.
+Whereas the total numbers indicate only the count of the tags per region.
+
+
 Region total nr. of tags per region
-87_636 5
-656_1143 5
-1141_1564 3
-1892_2398 3
-sum of tags 16
+ACH_87_636 5
+ACH_656_1143 5
+ACH_1141_1564 3
+ACH_1892_2398 3