Previous changeset 4:b202c97deabe (2018-10-08) Next changeset 6:26014c24323a (2018-10-26) |
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty |
modified:
fsd_regions.py |
b |
diff -r b202c97deabe -r 52454637bc45 fsd_regions.py --- a/fsd_regions.py Mon Oct 08 05:53:50 2018 -0400 +++ b/fsd_regions.py Wed Oct 17 05:23:33 2018 -0400 |
[ |
b'@@ -13,7 +13,9 @@\n # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf\n \n import argparse\n+import re\n import sys\n+from collections import OrderedDict\n \n import matplotlib.pyplot as plt\n import numpy\n@@ -55,11 +57,13 @@\n \n mut_array = readFileReferenceFree(refGenome, " ")\n length_regions = len(mut_array)\n+ group = numpy.array(mut_array[:, 0])\n+ seq_mut = numpy.array(mut_array[:, 1])\n+ alt_group = numpy.array(mut_array[:, 2])\n \n seq = numpy.array(data_array[:, 1])\n tags = numpy.array(data_array[:, 2])\n quant = numpy.array(data_array[:, 0]).astype(int)\n- group = numpy.array(mut_array[:, 0])\n \n all_ab = seq[numpy.where(tags == "ab")[0]]\n all_ba = seq[numpy.where(tags == "ba")[0]]\n@@ -69,34 +73,75 @@\n seqDic_ab = dict(zip(all_ab, quant_ab))\n seqDic_ba = dict(zip(all_ba, quant_ba))\n \n- seq_mut = numpy.array(mut_array[:, 1])\n+ if re.search(\'^(\\d)+_(\\d)+\', str(mut_array[0,0])) is None:\n+ seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)\n+ group = mut_array[seqMut_index,0]\n+ alt_group = mut_array[seqMut_index,2]\n+ mut_array = mut_array[seqMut_index,:]\n \n groupUnique, group_index = numpy.unique(group, return_index=True)\n groupUnique = groupUnique[numpy.argsort(group_index)]\n-\n lst_ab = []\n+ lst_ba = []\n for i in seq_mut:\n lst_ab.append(seqDic_ab.get(i))\n-\n- lst_ba = []\n- for i in seq_mut:\n lst_ba.append(seqDic_ba.get(i))\n \n quant_ab = numpy.array(lst_ab)\n quant_ba = numpy.array(lst_ba)\n \n- quantAfterRegion = []\n+ quantAfterRegion = OrderedDict()\n+ for key in groupUnique:\n+ quantAfterRegion[key] = []\n+\n for i in groupUnique:\n- dataAB = quant_ab[numpy.where(group == i)[0]]\n- dataBA = quant_ba[numpy.where(group == i)[0]]\n+ index_of_current_region = numpy.where(group == i)[0]\n+ quant_ba_i = quant_ba[index_of_current_region]\n+ alt_group_i = alt_group[index_of_current_region]\n+ index_alternative_refs = numpy.where(alt_group_i != "=")[0]\n+\n+ dataAB = quant_ab[index_of_current_region]\n bigFamilies = numpy.where(dataAB > 20)[0]\n dataAB[bigFamilies] = 22\n- bigFamilies = numpy.where(dataBA > 20)[0]\n- dataBA[bigFamilies] = 22\n+ for el in dataAB:\n+ quantAfterRegion[i].append(el)\n+\n+ if len(index_alternative_refs) == 0:\n+ dataBA = quant_ba_i\n+ bigFamilies = numpy.where(dataBA > 20)[0]\n+ dataBA[bigFamilies] = 22\n+ for el2 in dataBA:\n+ quantAfterRegion[i].append(el2)\n+ else: # get tags where 2nd mate is aligned to a different ref genome\n+ unique_alt = numpy.unique(alt_group_i[index_alternative_refs])\n+ for alt in unique_alt:\n+ ind_alt_tags = numpy.where(alt_group_i == alt)[0]\n+ dataBA = quant_ba_i[ind_alt_tags]\n \n- quantAll = numpy.concatenate((dataAB, dataBA))\n- quantAfterRegion.append(quantAll)\n+ bigFamilies = numpy.where(dataBA > 20)[0]\n+ if len(bigFamilies) != 0:\n+ if len(bigFamilies) == 1 and type(dataBA) != list:\n+ dataBA = 22\n+ quantAfterRegion[alt].append(dataBA)\n+ else:\n+ dataBA[bigFamilies] = 22\n+ for el3 in dataBA:\n+ quantAfterRegion[alt].append(el3)\n \n+ index_inverse = [x for x in range(0, len(index_of_curre'..b',} to {}".format(len(ind_alt_tags), alt)\n+ lengths_of_alt_aligned_tags.append(name)\n+ ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]\n+ name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)\n+ lengths_of_alt_aligned_tags.append(name_inverse)\n+ plt.text(0.78, 0.14 - s, "{}\\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)\n+ lengths_array_ab.append(nr_tags_ab)\n+ lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))\n+ else:\n+ plt.text(0.78, 0.14 - s, "=\\n", size=11,transform=plt.gcf().transFigure)\n+ lengths_array_ab.append(nr_tags_ab)\n+ lengths_array_ba.append(nr_tags_ab)\n+ index_array += 1\n \n plt.legend(loc=\'upper right\', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)\n- # plt.title(name1, fontsize=14)\n plt.xlabel("Family size", fontsize=14)\n plt.ylabel("Absolute Frequency", fontsize=14)\n plt.grid(b=True, which="major", color="#424242", linestyle=":")\n plt.margins(0.01, None)\n \n- # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight")\n pdf.savefig(fig, bbox_inch="tight")\n plt.close()\n \n@@ -167,6 +243,8 @@\n output_file.write("absolute frequency:{}{}{}{}\\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))\n output_file.write("relative frequency:{}{:.3f}{}{:.3f}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))\n output_file.write("total nr. of reads{}{}\\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))\n+ output_file.write("total nr. of tags{}{}\\n".format(sep, length_regions))\n+\n output_file.write("\\n\\nValues from family size distribution\\n")\n output_file.write("{}".format(sep))\n for i in groupUnique:\n@@ -179,23 +257,29 @@\n else:\n fs = "={}".format(fs)\n output_file.write("FS{}{}".format(fs, sep))\n- for n in range(len(groupUnique)):\n- output_file.write("{}{}".format(int(counts[0][n][j]), sep))\n+\n+ if len(groupUnique) == 1:\n+ output_file.write("{}{}".format(int(counts[0][j]), sep))\n+ else:\n+ for n in range(len(groupUnique)):\n+ output_file.write("{}{}".format(int(counts[0][n][j]), sep))\n+\n output_file.write("\\n")\n j += 1\n output_file.write("sum{}".format(sep))\n- for i in counts[0]:\n- output_file.write("{}{}".format(int(sum(i)), sep))\n+ if len(groupUnique) == 1:\n+ output_file.write("{}{}".format(int(sum(counts[0])), sep))\n+ else:\n+ for i in counts[0]:\n+ output_file.write("{}{}".format(int(sum(i)), sep))\n output_file.write("\\n")\n- output_file.write("\\n\\nIn the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the single count of the tags per region.\\n")\n- output_file.write("Region{}total nr. of tags per region\\n".format(sep))\n- for i, count in zip(groupUnique, quantAfterRegion):\n- output_file.write("{}{}{}\\n".format(i, sep, len(count) / 2))\n- output_file.write("sum of tags{}{}\\n".format(sep, length_regions))\n+ output_file.write("\\n\\nRegion{}total nr. of ab{}ba tags\\n".format(sep, sep))\n+\n+ for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique):\n+ output_file.write("{}{}{}{}{}\\n".format(i, sep, ab, sep, ba))\n \n print("Files successfully created!")\n- # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))\n \n \n if __name__ == \'__main__\':\n- sys.exit(compare_read_families_refGenome(sys.argv))\n+ sys.exit(compare_read_families_refGenome(sys.argv))\n' |