| Previous changeset 10:04c544617b44 (2018-11-26) Next changeset 12:fa3ff1c5280d (2018-11-26) |
|
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 2aea9e30f5ed4fd3db3fb44ddb8aacb48a62eccc |
|
modified:
fsd_regions.py |
| b |
| diff -r 04c544617b44 -r 37db9decb5d0 fsd_regions.py --- a/fsd_regions.py Mon Nov 26 04:42:25 2018 -0500 +++ b/fsd_regions.py Mon Nov 26 04:51:11 2018 -0500 |
| [ |
| b'@@ -1,220 +1,270 @@\n-#!/usr/bin/env python\n-\n-# Family size distribution of tags which were aligned to the reference genome\n-#\n-# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)\n-# Contact: monika.heinzl@edumail.at\n-#\n-# Takes at least one TABULAR file with tags before the alignment to the SSCS\n-# and a TXT with tags of reads that overlap the regions of the reference genome as input.\n-# The program produces a plot which shows the distribution of family sizes of the tags from the input files and\n-# a tabular file with the data of the plot.\n-\n-# USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf\n-\n-import argparse\n-import re\n-import sys\n-from collections import OrderedDict\n-\n-import matplotlib.pyplot as plt\n-import numpy\n-from matplotlib.backends.backend_pdf import PdfPages\n-\n-plt.switch_backend(\'agg\')\n-\n-\n-def readFileReferenceFree(file, delim):\n- with open(file, \'r\') as dest_f:\n- data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments=\'#\', dtype=\'string\')\n- return(data_array)\n-\n-\n-def make_argparser():\n- parser = argparse.ArgumentParser(description=\'Family Size Distribution of tags which were aligned to regions of the reference genome\')\n- parser.add_argument(\'--inputFile\', help=\'Tabular File with three columns: ab or ba, tag and family size.\')\n- parser.add_argument(\'--inputName1\')\n- parser.add_argument(\'--ref_genome\', help=\'TXT File with tags of reads that overlap the region.\')\n- parser.add_argument(\'--output_pdf\', default="data.pdf", type=str, help=\'Name of the pdf and tabular file.\')\n- parser.add_argument(\'--output_tabular\', default="data.tabular", type=str, help=\'Name of the pdf and tabular file.\')\n- return parser\n-\n-\n-def compare_read_families_refGenome(argv):\n- parser = make_argparser()\n- args = parser.parse_args(argv[1:])\n-\n- firstFile = args.inputFile\n- name1 = args.inputName1\n- name1 = name1.split(".tabular")[0]\n- refGenome = args.ref_genome\n- title_file = args.output_pdf\n- title_file2 = args.output_tabular\n- sep = "\\t"\n-\n- with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:\n- data_array = readFileReferenceFree(firstFile, "\\t")\n-\n- mut_array = readFileReferenceFree(refGenome, " ")\n- group = numpy.array(mut_array[:, 0])\n- seq_mut = numpy.array(mut_array[:, 1])\n-\n- seq = numpy.array(data_array[:, 1])\n- tags = numpy.array(data_array[:, 2])\n- quant = numpy.array(data_array[:, 0]).astype(int)\n-\n- all_ab = seq[numpy.where(tags == "ab")[0]]\n- all_ba = seq[numpy.where(tags == "ba")[0]]\n- quant_ab = quant[numpy.where(tags == "ab")[0]]\n- quant_ba = quant[numpy.where(tags == "ba")[0]]\n-\n- seqDic_ab = dict(zip(all_ab, quant_ab))\n- seqDic_ba = dict(zip(all_ba, quant_ba))\n-\n- if re.search(\'_(\\d)+_(\\d)+$\', str(mut_array[0,0])) is None:\n- seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)\n- group = mut_array[seqMut_index,0]\n- mut_array = mut_array[seqMut_index,:]\n- length_regions = len(seq_mut)*2\n-\n- groupUnique, group_index = numpy.unique(group, return_index=True)\n- groupUnique = groupUnique[numpy.argsort(group_index)]\n-\n- lst_ab = []\n- lst_ba = []\n- for i in seq_mut:\n- lst_ab.append(seqDic_ab.get(i))\n- lst_ba.append(seqDic_ba.get(i))\n-\n- quant_ab = numpy.array(lst_ab)\n- quant_ba = numpy.array(lst_ba)\n-\n- quantAfterRegion = []\n-\n- for i in groupUnique:\n- dataAB = quant_ab[numpy.where(group == i)[0]]\n- dataBA = quant_ba[numpy.where(group == i)[0]]\n- bigFamilies = numpy.where(dataAB > 20)[0]\n- dataAB[bigFamilies] = 22\n- bigFamilies = numpy.where(dataBA > 20)['..b', transform=plt.gcf().transFigure)\r\n+\r\n+ count2 = np.bincount(map(int, quant_ba)) # original counts\r\n+\r\n+ legend = "BA\\n{}\\n{}\\n{:.5f}" \\\r\n+ .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))\r\n+ plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)\r\n+\r\n+ plt.text(0.55, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure)\r\n+ plt.text(0.8, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11,\r\n+ transform=plt.gcf().transFigure)\r\n+\r\n+ legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the single count of the tags per region.\\n"\r\n+ plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure)\r\n+\r\n+ space = 0\r\n+ for i, count in zip(group, quantAfterRegion):\r\n+ plt.text(0.55, 0.15 - space, "{}:\\n".format(i), size=11, transform=plt.gcf().transFigure)\r\n+ plt.text(0.8, 0.15 - space, "{:,}\\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure)\r\n+ space = space + 0.02\r\n+\r\n+ plt.legend(loc=\'upper right\', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)\r\n+ plt.xlabel("Family size", fontsize=14)\r\n+ plt.ylabel("Absolute Frequency", fontsize=14)\r\n+ plt.grid(b=True, which="major", color="#424242", linestyle=":")\r\n+ plt.margins(0.01, None)\r\n+\r\n+ pdf.savefig(fig, bbox_inch="tight")\r\n+ plt.close()\r\n+\r\n+ output_file.write("Dataset:{}{}\\n".format(sep, name1))\r\n+ output_file.write("{}AB{}BA\\n".format(sep, sep))\r\n+ output_file.write("max. family size:{}{}{}{}\\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))\r\n+ output_file.write("absolute frequency:{}{}{}{}\\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))\r\n+ output_file.write("relative frequency:{}{:.3f}{}{:.3f}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))\r\n+ output_file.write("total nr. of reads{}{}\\n".format(sep, sum(np.array(data_array[:, 0]).astype(int))))\r\n+ output_file.write("total nr. of tags{}{} ({})\\n".format(sep, length_regions, length_regions / 2))\r\n+ output_file.write("\\n\\nValues from family size distribution\\n")\r\n+ output_file.write("{}".format(sep))\r\n+ for i in group:\r\n+ output_file.write("{}{}".format(i, sep))\r\n+ output_file.write("\\n")\r\n+\r\n+ j = 0\r\n+ for fs in counts[1][0:len(counts[1]) - 1]:\r\n+ if fs == 21:\r\n+ fs = ">20"\r\n+ else:\r\n+ fs = "={}".format(fs)\r\n+ output_file.write("FS{}{}".format(fs, sep))\r\n+ if len(group) == 1:\r\n+ output_file.write("{}{}".format(int(counts[0][j]), sep))\r\n+ else:\r\n+ for n in range(len(group)):\r\n+ output_file.write("{}{}".format(int(counts[0][n][j]), sep))\r\n+ output_file.write("\\n")\r\n+ j += 1\r\n+ output_file.write("sum{}".format(sep))\r\n+\r\n+ if len(group) == 1:\r\n+ output_file.write("{}{}".format(int(sum(counts[0])), sep))\r\n+ else:\r\n+ for i in counts[0]:\r\n+ output_file.write("{}{}".format(int(sum(i)), sep))\r\n+ output_file.write("\\n")\r\n+ output_file.write("\\n\\nIn the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the single count of the tags per region.\\n")\r\n+ output_file.write("Region{}total nr. of tags per region\\n".format(sep))\r\n+ for i, count in zip(group, quantAfterRegion):\r\n+ output_file.write("{}{}{}\\n".format(i, sep, len(count) / 2))\r\n+\r\n+ print("Files successfully created!")\r\n+\r\n+\r\n+if __name__ == \'__main__\':\r\n+ sys.exit(compare_read_families_refGenome(sys.argv))\r\n' |