Repository 'fsd_regions'
hg clone https://toolshed.g2.bx.psu.edu/repos/mheinzl/fsd_regions

Changeset 11:37db9decb5d0 (2018-11-26)
Previous changeset 10:04c544617b44 (2018-11-26) Next changeset 12:fa3ff1c5280d (2018-11-26)
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 2aea9e30f5ed4fd3db3fb44ddb8aacb48a62eccc
modified:
fsd_regions.py
b
diff -r 04c544617b44 -r 37db9decb5d0 fsd_regions.py
--- a/fsd_regions.py Mon Nov 26 04:42:25 2018 -0500
+++ b/fsd_regions.py Mon Nov 26 04:51:11 2018 -0500
[
b'@@ -1,220 +1,270 @@\n-#!/usr/bin/env python\n-\n-# Family size distribution of tags which were aligned to the reference genome\n-#\n-# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)\n-# Contact: monika.heinzl@edumail.at\n-#\n-# Takes at least one TABULAR file with tags before the alignment to the SSCS\n-# and a TXT with tags of reads that overlap the regions of the reference genome as input.\n-# The program produces a plot which shows the distribution of family sizes of the tags from the input files and\n-# a tabular file with the data of the plot.\n-\n-# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome  filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf\n-\n-import argparse\n-import re\n-import sys\n-from collections import OrderedDict\n-\n-import matplotlib.pyplot as plt\n-import numpy\n-from matplotlib.backends.backend_pdf import PdfPages\n-\n-plt.switch_backend(\'agg\')\n-\n-\n-def readFileReferenceFree(file, delim):\n-    with open(file, \'r\') as dest_f:\n-        data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments=\'#\', dtype=\'string\')\n-        return(data_array)\n-\n-\n-def make_argparser():\n-    parser = argparse.ArgumentParser(description=\'Family Size Distribution of tags which were aligned to regions of the reference genome\')\n-    parser.add_argument(\'--inputFile\', help=\'Tabular File with three columns: ab or ba, tag and family size.\')\n-    parser.add_argument(\'--inputName1\')\n-    parser.add_argument(\'--ref_genome\', help=\'TXT File with tags of reads that overlap the region.\')\n-    parser.add_argument(\'--output_pdf\', default="data.pdf", type=str, help=\'Name of the pdf and tabular file.\')\n-    parser.add_argument(\'--output_tabular\', default="data.tabular", type=str, help=\'Name of the pdf and tabular file.\')\n-    return parser\n-\n-\n-def compare_read_families_refGenome(argv):\n-    parser = make_argparser()\n-    args = parser.parse_args(argv[1:])\n-\n-    firstFile = args.inputFile\n-    name1 = args.inputName1\n-    name1 = name1.split(".tabular")[0]\n-    refGenome = args.ref_genome\n-    title_file = args.output_pdf\n-    title_file2 = args.output_tabular\n-    sep = "\\t"\n-\n-    with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:\n-        data_array = readFileReferenceFree(firstFile, "\\t")\n-\n-        mut_array = readFileReferenceFree(refGenome, " ")\n-        group = numpy.array(mut_array[:, 0])\n-        seq_mut = numpy.array(mut_array[:, 1])\n-\n-        seq = numpy.array(data_array[:, 1])\n-        tags = numpy.array(data_array[:, 2])\n-        quant = numpy.array(data_array[:, 0]).astype(int)\n-\n-        all_ab = seq[numpy.where(tags == "ab")[0]]\n-        all_ba = seq[numpy.where(tags == "ba")[0]]\n-        quant_ab = quant[numpy.where(tags == "ab")[0]]\n-        quant_ba = quant[numpy.where(tags == "ba")[0]]\n-\n-        seqDic_ab = dict(zip(all_ab, quant_ab))\n-        seqDic_ba = dict(zip(all_ba, quant_ba))\n-\n-        if re.search(\'_(\\d)+_(\\d)+$\', str(mut_array[0,0])) is None:\n-            seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)\n-            group = mut_array[seqMut_index,0]\n-            mut_array = mut_array[seqMut_index,:]\n-        length_regions = len(seq_mut)*2\n-\n-        groupUnique, group_index = numpy.unique(group, return_index=True)\n-        groupUnique = groupUnique[numpy.argsort(group_index)]\n-\n-        lst_ab = []\n-        lst_ba = []\n-        for i in seq_mut:\n-            lst_ab.append(seqDic_ab.get(i))\n-            lst_ba.append(seqDic_ba.get(i))\n-\n-        quant_ab = numpy.array(lst_ab)\n-        quant_ba = numpy.array(lst_ba)\n-\n-        quantAfterRegion = []\n-\n-        for i in groupUnique:\n-            dataAB = quant_ab[numpy.where(group == i)[0]]\n-            dataBA = quant_ba[numpy.where(group == i)[0]]\n-            bigFamilies = numpy.where(dataAB > 20)[0]\n-            dataAB[bigFamilies] = 22\n-            bigFamilies = numpy.where(dataBA > 20)['..b', transform=plt.gcf().transFigure)\r\n+\r\n+        count2 = np.bincount(map(int, quant_ba))  # original counts\r\n+\r\n+        legend = "BA\\n{}\\n{}\\n{:.5f}" \\\r\n+            .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))\r\n+        plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)\r\n+\r\n+        plt.text(0.55, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)\r\n+        plt.text(0.8, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,\r\n+                 transform=plt.gcf().transFigure)\r\n+\r\n+        legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the single count of the tags per region.\\n"\r\n+        plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)\r\n+\r\n+        space = 0\r\n+        for i, count in zip(group, quantAfterRegion):\r\n+            plt.text(0.55, 0.15 - space, "{}:\\n".format(i), size=11, transform=plt.gcf().transFigure)\r\n+            plt.text(0.8, 0.15 - space, "{:,}\\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)\r\n+            space = space + 0.02\r\n+\r\n+        plt.legend(loc=\'upper right\', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)\r\n+        plt.xlabel("Family size", fontsize=14)\r\n+        plt.ylabel("Absolute Frequency", fontsize=14)\r\n+        plt.grid(b=True, which="major", color="#424242", linestyle=":")\r\n+        plt.margins(0.01, None)\r\n+\r\n+        pdf.savefig(fig, bbox_inch="tight")\r\n+        plt.close()\r\n+\r\n+        output_file.write("Dataset:{}{}\\n".format(sep, name1))\r\n+        output_file.write("{}AB{}BA\\n".format(sep, sep))\r\n+        output_file.write("max. family size:{}{}{}{}\\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))\r\n+        output_file.write("absolute frequency:{}{}{}{}\\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))\r\n+        output_file.write("relative frequency:{}{:.3f}{}{:.3f}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))\r\n+        output_file.write("total nr. of reads{}{}\\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))\r\n+        output_file.write("total nr. of tags{}{} ({})\\n".format(sep, length_regions, length_regions / 2))\r\n+        output_file.write("\\n\\nValues from family size distribution\\n")\r\n+        output_file.write("{}".format(sep))\r\n+        for i in group:\r\n+            output_file.write("{}{}".format(i, sep))\r\n+        output_file.write("\\n")\r\n+\r\n+        j = 0\r\n+        for fs in counts[1][0:len(counts[1]) - 1]:\r\n+            if fs == 21:\r\n+                fs = ">20"\r\n+            else:\r\n+                fs = "={}".format(fs)\r\n+            output_file.write("FS{}{}".format(fs, sep))\r\n+            if len(group) == 1:\r\n+                output_file.write("{}{}".format(int(counts[0][j]), sep))\r\n+            else:\r\n+                for n in range(len(group)):\r\n+                    output_file.write("{}{}".format(int(counts[0][n][j]), sep))\r\n+            output_file.write("\\n")\r\n+            j += 1\r\n+        output_file.write("sum{}".format(sep))\r\n+\r\n+        if len(group) == 1:\r\n+            output_file.write("{}{}".format(int(sum(counts[0])), sep))\r\n+        else:\r\n+            for i in counts[0]:\r\n+                output_file.write("{}{}".format(int(sum(i)), sep))\r\n+        output_file.write("\\n")\r\n+        output_file.write("\\n\\nIn the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the single count of the tags per region.\\n")\r\n+        output_file.write("Region{}total nr. of tags per region\\n".format(sep))\r\n+        for i, count in zip(group, quantAfterRegion):\r\n+            output_file.write("{}{}{}\\n".format(i, sep, len(count) / 2))\r\n+\r\n+    print("Files successfully created!")\r\n+\r\n+\r\n+if __name__ == \'__main__\':\r\n+    sys.exit(compare_read_families_refGenome(sys.argv))\r\n'