Repository 'fsd_regions'
hg clone https://toolshed.g2.bx.psu.edu/repos/mheinzl/fsd_regions

Changeset 5:52454637bc45 (2018-10-17)
Previous changeset 4:b202c97deabe (2018-10-08) Next changeset 6:26014c24323a (2018-10-26)
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
modified:
fsd_regions.py
b
diff -r b202c97deabe -r 52454637bc45 fsd_regions.py
--- a/fsd_regions.py Mon Oct 08 05:53:50 2018 -0400
+++ b/fsd_regions.py Wed Oct 17 05:23:33 2018 -0400
[
b'@@ -13,7 +13,9 @@\n # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome  filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf\n \n import argparse\n+import re\n import sys\n+from collections import OrderedDict\n \n import matplotlib.pyplot as plt\n import numpy\n@@ -55,11 +57,13 @@\n \n         mut_array = readFileReferenceFree(refGenome, " ")\n         length_regions = len(mut_array)\n+        group = numpy.array(mut_array[:, 0])\n+        seq_mut = numpy.array(mut_array[:, 1])\n+        alt_group = numpy.array(mut_array[:, 2])\n \n         seq = numpy.array(data_array[:, 1])\n         tags = numpy.array(data_array[:, 2])\n         quant = numpy.array(data_array[:, 0]).astype(int)\n-        group = numpy.array(mut_array[:, 0])\n \n         all_ab = seq[numpy.where(tags == "ab")[0]]\n         all_ba = seq[numpy.where(tags == "ba")[0]]\n@@ -69,34 +73,75 @@\n         seqDic_ab = dict(zip(all_ab, quant_ab))\n         seqDic_ba = dict(zip(all_ba, quant_ba))\n \n-        seq_mut = numpy.array(mut_array[:, 1])\n+        if re.search(\'^(\\d)+_(\\d)+\', str(mut_array[0,0])) is None:\n+            seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)\n+            group = mut_array[seqMut_index,0]\n+            alt_group = mut_array[seqMut_index,2]\n+            mut_array = mut_array[seqMut_index,:]\n \n         groupUnique, group_index = numpy.unique(group, return_index=True)\n         groupUnique = groupUnique[numpy.argsort(group_index)]\n-\n         lst_ab = []\n+        lst_ba = []\n         for i in seq_mut:\n             lst_ab.append(seqDic_ab.get(i))\n-\n-        lst_ba = []\n-        for i in seq_mut:\n             lst_ba.append(seqDic_ba.get(i))\n \n         quant_ab = numpy.array(lst_ab)\n         quant_ba = numpy.array(lst_ba)\n \n-        quantAfterRegion = []\n+        quantAfterRegion = OrderedDict()\n+        for key in groupUnique:\n+            quantAfterRegion[key] = []\n+\n         for i in groupUnique:\n-            dataAB = quant_ab[numpy.where(group == i)[0]]\n-            dataBA = quant_ba[numpy.where(group == i)[0]]\n+            index_of_current_region = numpy.where(group == i)[0]\n+            quant_ba_i = quant_ba[index_of_current_region]\n+            alt_group_i = alt_group[index_of_current_region]\n+            index_alternative_refs = numpy.where(alt_group_i != "=")[0]\n+\n+            dataAB = quant_ab[index_of_current_region]\n             bigFamilies = numpy.where(dataAB > 20)[0]\n             dataAB[bigFamilies] = 22\n-            bigFamilies = numpy.where(dataBA > 20)[0]\n-            dataBA[bigFamilies] = 22\n+            for el in dataAB:\n+                quantAfterRegion[i].append(el)\n+\n+            if len(index_alternative_refs) == 0:\n+                dataBA = quant_ba_i\n+                bigFamilies = numpy.where(dataBA > 20)[0]\n+                dataBA[bigFamilies] = 22\n+                for el2 in dataBA:\n+                    quantAfterRegion[i].append(el2)\n+            else:  # get tags where 2nd mate is aligned to a different ref genome\n+                unique_alt = numpy.unique(alt_group_i[index_alternative_refs])\n+                for alt in unique_alt:\n+                    ind_alt_tags = numpy.where(alt_group_i == alt)[0]\n+                    dataBA = quant_ba_i[ind_alt_tags]\n \n-            quantAll = numpy.concatenate((dataAB, dataBA))\n-            quantAfterRegion.append(quantAll)\n+                    bigFamilies = numpy.where(dataBA > 20)[0]\n+                    if len(bigFamilies) != 0:\n+                        if len(bigFamilies) == 1 and type(dataBA) != list:\n+                            dataBA = 22\n+                            quantAfterRegion[alt].append(dataBA)\n+                        else:\n+                            dataBA[bigFamilies] = 22\n+                            for el3 in dataBA:\n+                                quantAfterRegion[alt].append(el3)\n \n+                index_inverse = [x for x in range(0, len(index_of_curre'..b',} to {}".format(len(ind_alt_tags), alt)\n+                    lengths_of_alt_aligned_tags.append(name)\n+                ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]\n+                name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)\n+                lengths_of_alt_aligned_tags.append(name_inverse)\n+                plt.text(0.78, 0.14 - s, "{}\\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)\n+                lengths_array_ab.append(nr_tags_ab)\n+                lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))\n+            else:\n+                plt.text(0.78, 0.14 - s, "=\\n", size=11,transform=plt.gcf().transFigure)\n+                lengths_array_ab.append(nr_tags_ab)\n+                lengths_array_ba.append(nr_tags_ab)\n+            index_array += 1\n \n         plt.legend(loc=\'upper right\', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)\n-        # plt.title(name1, fontsize=14)\n         plt.xlabel("Family size", fontsize=14)\n         plt.ylabel("Absolute Frequency", fontsize=14)\n         plt.grid(b=True, which="major", color="#424242", linestyle=":")\n         plt.margins(0.01, None)\n \n-        # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight")\n         pdf.savefig(fig, bbox_inch="tight")\n         plt.close()\n \n@@ -167,6 +243,8 @@\n         output_file.write("absolute frequency:{}{}{}{}\\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))\n         output_file.write("relative frequency:{}{:.3f}{}{:.3f}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))\n         output_file.write("total nr. of reads{}{}\\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))\n+        output_file.write("total nr. of tags{}{}\\n".format(sep, length_regions))\n+\n         output_file.write("\\n\\nValues from family size distribution\\n")\n         output_file.write("{}".format(sep))\n         for i in groupUnique:\n@@ -179,23 +257,29 @@\n             else:\n                 fs = "={}".format(fs)\n             output_file.write("FS{}{}".format(fs, sep))\n-            for n in range(len(groupUnique)):\n-                output_file.write("{}{}".format(int(counts[0][n][j]), sep))\n+\n+            if len(groupUnique) == 1:\n+                output_file.write("{}{}".format(int(counts[0][j]), sep))\n+            else:\n+                for n in range(len(groupUnique)):\n+                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))\n+\n             output_file.write("\\n")\n             j += 1\n         output_file.write("sum{}".format(sep))\n-        for i in counts[0]:\n-            output_file.write("{}{}".format(int(sum(i)), sep))\n+        if len(groupUnique) == 1:\n+            output_file.write("{}{}".format(int(sum(counts[0])), sep))\n+        else:\n+            for i in counts[0]:\n+                output_file.write("{}{}".format(int(sum(i)), sep))\n         output_file.write("\\n")\n-        output_file.write("\\n\\nIn the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the single count of the tags per region.\\n")\n-        output_file.write("Region{}total nr. of tags per region\\n".format(sep))\n-        for i, count in zip(groupUnique, quantAfterRegion):\n-            output_file.write("{}{}{}\\n".format(i, sep, len(count) / 2))\n-        output_file.write("sum of tags{}{}\\n".format(sep, length_regions))\n+        output_file.write("\\n\\nRegion{}total nr. of ab{}ba tags\\n".format(sep, sep))\n+\n+        for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique):\n+            output_file.write("{}{}{}{}{}\\n".format(i, sep, ab, sep, ba))\n \n     print("Files successfully created!")\n-    # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))\n \n \n if __name__ == \'__main__\':\n-    sys.exit(compare_read_families_refGenome(sys.argv))\n+   sys.exit(compare_read_families_refGenome(sys.argv))\n'