| Previous changeset 5:52454637bc45 (2018-10-17) Next changeset 7:3b8a0e462021 (2018-11-20) |
|
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty |
|
modified:
fsd_regions.py test-data/Test_data_regions.txt test-data/output_file.pdf test-data/output_file.tabular |
| b |
| diff -r 52454637bc45 -r 26014c24323a fsd_regions.py --- a/fsd_regions.py Wed Oct 17 05:23:33 2018 -0400 +++ b/fsd_regions.py Fri Oct 26 07:54:03 2018 -0400 |
| [ |
| b'@@ -56,10 +56,8 @@\n data_array = readFileReferenceFree(firstFile, "\\t")\n \n mut_array = readFileReferenceFree(refGenome, " ")\n- length_regions = len(mut_array)\n group = numpy.array(mut_array[:, 0])\n seq_mut = numpy.array(mut_array[:, 1])\n- alt_group = numpy.array(mut_array[:, 2])\n \n seq = numpy.array(data_array[:, 1])\n tags = numpy.array(data_array[:, 2])\n@@ -73,14 +71,15 @@\n seqDic_ab = dict(zip(all_ab, quant_ab))\n seqDic_ba = dict(zip(all_ba, quant_ba))\n \n- if re.search(\'^(\\d)+_(\\d)+\', str(mut_array[0,0])) is None:\n+ if re.search(\'_(\\d)+_(\\d)+$\', str(mut_array[0,0])) is None:\n seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)\n group = mut_array[seqMut_index,0]\n- alt_group = mut_array[seqMut_index,2]\n mut_array = mut_array[seqMut_index,:]\n+ length_regions = len(seq_mut)*2\n \n groupUnique, group_index = numpy.unique(group, return_index=True)\n groupUnique = groupUnique[numpy.argsort(group_index)]\n+\n lst_ab = []\n lst_ba = []\n for i in seq_mut:\n@@ -90,58 +89,19 @@\n quant_ab = numpy.array(lst_ab)\n quant_ba = numpy.array(lst_ba)\n \n- quantAfterRegion = OrderedDict()\n- for key in groupUnique:\n- quantAfterRegion[key] = []\n+ quantAfterRegion = []\n \n for i in groupUnique:\n- index_of_current_region = numpy.where(group == i)[0]\n- quant_ba_i = quant_ba[index_of_current_region]\n- alt_group_i = alt_group[index_of_current_region]\n- index_alternative_refs = numpy.where(alt_group_i != "=")[0]\n-\n- dataAB = quant_ab[index_of_current_region]\n+ dataAB = quant_ab[numpy.where(group == i)[0]]\n+ dataBA = quant_ba[numpy.where(group == i)[0]]\n bigFamilies = numpy.where(dataAB > 20)[0]\n dataAB[bigFamilies] = 22\n- for el in dataAB:\n- quantAfterRegion[i].append(el)\n-\n- if len(index_alternative_refs) == 0:\n- dataBA = quant_ba_i\n- bigFamilies = numpy.where(dataBA > 20)[0]\n- dataBA[bigFamilies] = 22\n- for el2 in dataBA:\n- quantAfterRegion[i].append(el2)\n- else: # get tags where 2nd mate is aligned to a different ref genome\n- unique_alt = numpy.unique(alt_group_i[index_alternative_refs])\n- for alt in unique_alt:\n- ind_alt_tags = numpy.where(alt_group_i == alt)[0]\n- dataBA = quant_ba_i[ind_alt_tags]\n+ bigFamilies = numpy.where(dataBA > 20)[0]\n+ dataBA[bigFamilies] = 22\n \n- bigFamilies = numpy.where(dataBA > 20)[0]\n- if len(bigFamilies) != 0:\n- if len(bigFamilies) == 1 and type(dataBA) != list:\n- dataBA = 22\n- quantAfterRegion[alt].append(dataBA)\n- else:\n- dataBA[bigFamilies] = 22\n- for el3 in dataBA:\n- quantAfterRegion[alt].append(el3)\n+ quantAll = numpy.concatenate((dataAB, dataBA))\n+ quantAfterRegion.append(quantAll)\n \n- index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs]\n- data_BA_other = quant_ba_i[index_inverse]\n- bigFamilies_other = numpy.where(data_BA_other > 20)[0]\n-\n- if len(bigFamilies_other) != 0:\n- if len(bigFamilies_other) == 1 and type(data_BA_other) != list:\n- data_BA_other = 22\n- quantAfterRegion[i].append(data_BA_other)\n- else:\n- data_BA_other[bigFamilies_other] = 22\n- for el3 in data'..b'=11, transform=plt.gcf().transFigure)\n+ #space = numpy.arange(0, len(groupUnique), 0.02)\n+ s = 0\n index_array = 0\n- for i, s, count in zip(groupUnique, space, quantAfterRegion):\n+ for i, count in zip(groupUnique, quantAfterRegion):\n index_of_current_region = numpy.where(group == i)[0]\n-\n plt.text(0.55, 0.14 - s, "{}=\\n".format(i), size=11, transform=plt.gcf().transFigure)\n- if re.search(\'^(\\d)+_(\\d)+\', str(mut_array[0, 0])) is None:\n+ if re.search(\'_(\\d)+_(\\d)+$\', str(mut_array[0, 0])) is None:\n nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1]))\n else:\n nr_tags_ab = len(mut_array[index_of_current_region, 1])\n-\n- plt.text(0.7, 0.14 - s, "{:,}\\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)\n-\n- alt_group_i = alt_group[index_of_current_region]\n- alternative = numpy.where(alt_group_i != "=")[0]\n- unique_alt = numpy.unique(alt_group_i[alternative])\n- lengths_of_alt_aligned_tags = []\n- if len(alternative) != 0:\n- for alt in unique_alt:\n- ind_alt_tags = numpy.where(alt_group_i == alt)[0]\n- name = "{:,} to {}".format(len(ind_alt_tags), alt)\n- lengths_of_alt_aligned_tags.append(name)\n- ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]\n- name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)\n- lengths_of_alt_aligned_tags.append(name_inverse)\n- plt.text(0.78, 0.14 - s, "{}\\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)\n- lengths_array_ab.append(nr_tags_ab)\n- lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))\n- else:\n- plt.text(0.78, 0.14 - s, "=\\n", size=11,transform=plt.gcf().transFigure)\n- lengths_array_ab.append(nr_tags_ab)\n- lengths_array_ba.append(nr_tags_ab)\n- index_array += 1\n+ plt.text(0.75, 0.14 - s, "{:,}\\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)\n+ s = s + 0.02\n \n plt.legend(loc=\'upper right\', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)\n plt.xlabel("Family size", fontsize=14)\n@@ -243,7 +178,7 @@\n output_file.write("absolute frequency:{}{}{}{}\\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))\n output_file.write("relative frequency:{}{:.3f}{}{:.3f}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))\n output_file.write("total nr. of reads{}{}\\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))\n- output_file.write("total nr. of tags{}{}\\n".format(sep, length_regions))\n+ output_file.write("total nr. of tags{}{} ({})\\n".format(sep, length_regions, length_regions/2))\n \n output_file.write("\\n\\nValues from family size distribution\\n")\n output_file.write("{}".format(sep))\n@@ -273,11 +208,11 @@\n for i in counts[0]:\n output_file.write("{}{}".format(int(sum(i)), sep))\n output_file.write("\\n")\n- output_file.write("\\n\\nRegion{}total nr. of ab{}ba tags\\n".format(sep, sep))\n+ output_file.write("\\n\\nIn the plot, both family sizes of the ab and ba strands were used.\\nWhereas the total numbers indicate only the count of the tags per region.\\n")\n+ output_file.write("\\n\\nRegion{}total nr. of tags per region\\n".format(sep, sep))\n \n- for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique):\n- output_file.write("{}{}{}{}{}\\n".format(i, sep, ab, sep, ba))\n-\n+ for i, count in zip(groupUnique, quantAfterRegion):\n+ output_file.write("{}{}{}\\n".format(i,sep,len(count) / 2))\n print("Files successfully created!")\n \n \n' |
| b |
| diff -r 52454637bc45 -r 26014c24323a test-data/Test_data_regions.txt --- a/test-data/Test_data_regions.txt Wed Oct 17 05:23:33 2018 -0400 +++ b/test-data/Test_data_regions.txt Fri Oct 26 07:54:03 2018 -0400 |
| b |
| @@ -1,17 +1,16 @@ -87_636 AAAAAACATCCCAATAAGAAATCA -87_636 AAAAAAGTCCTTCGACTCAAGCGG -87_636 AAAAAATAGTTAAGCCGACACACT -87_636 AAAAAATGTGCCGAACCTTGGCGA -87_636 AAAAACAACATAGCTTGAAAATTT -656_1143 ATTCGGATAATTCGACGCAACATT -656_1143 ATTCGTCGACAATACAAAGGGGCC -656_1143 ATTGCCAGTGTGGGCTGGTTAGTA -656_1143 ATTTCGCGACCATCCGCCACTTTG -656_1143 CAAACTTTAGCACAGTGTGTGTCC -1141_1564 ATAAACGGCCTTCGACATTGTGAC -1141_1564 ATAAAGTCACCTGTGAATACGTTG -1141_1564 ATAAATCGAAACCGTGCCCAACAA -1892_2398 ATTTAGATATTTTCTTCTTTTTCT -1892_2398 ATTTAGTTATCCGTCGGCGACGAA -1892_2398 ATTTAGTTTGAATTGCCCTGCGTC - +ACH_87_636 AAAAAACATCCCAATAAGAAATCA +ACH_87_636 AAAAAAGTCCTTCGACTCAAGCGG +ACH_87_636 AAAAAATAGTTAAGCCGACACACT +ACH_87_636 AAAAAATGTGCCGAACCTTGGCGA +ACH_87_636 AAAAACAACATAGCTTGAAAATTT +ACH_656_1143 ATTCGGATAATTCGACGCAACATT +ACH_656_1143 ATTCGTCGACAATACAAAGGGGCC +ACH_656_1143 ATTGCCAGTGTGGGCTGGTTAGTA +ACH_656_1143 ATTTCGCGACCATCCGCCACTTTG +ACH_656_1143 CAAACTTTAGCACAGTGTGTGTCC +ACH_1141_1564 ATAAACGGCCTTCGACATTGTGAC +ACH_1141_1564 ATAAAGTCACCTGTGAATACGTTG +ACH_1141_1564 ATAAATCGAAACCGTGCCCAACAA +ACH_1892_2398 ATTTAGATATTTTCTTCTTTTTCT +ACH_1892_2398 ATTTAGTTATCCGTCGGCGACGAA +ACH_1892_2398 ATTTAGTTTGAATTGCCCTGCGTC |
| b |
| diff -r 52454637bc45 -r 26014c24323a test-data/output_file.pdf |
| b |
| Binary file test-data/output_file.pdf has changed |
| b |
| diff -r 52454637bc45 -r 26014c24323a test-data/output_file.tabular --- a/test-data/output_file.tabular Wed Oct 17 05:23:33 2018 -0400 +++ b/test-data/output_file.tabular Fri Oct 26 07:54:03 2018 -0400 |
| b |
| @@ -5,10 +5,11 @@ relative frequency: 0.209 0.062 total nr. of reads 1312 +total nr. of tags 32 (16) Values from family size distribution - 87_636 656_1143 1141_1564 1892_2398 + ACH_87_636 ACH_656_1143 ACH_1141_1564 ACH_1892_2398 FS=3 0 0 0 1 FS=4 2 0 0 0 FS=5 2 0 0 0 @@ -32,10 +33,11 @@ In the plot, both family sizes of the ab and ba strands were used. -Whereas the total numbers indicate only the single count of the tags per region. +Whereas the total numbers indicate only the count of the tags per region. + + Region total nr. of tags per region -87_636 5 -656_1143 5 -1141_1564 3 -1892_2398 3 -sum of tags 16 +ACH_87_636 5 +ACH_656_1143 5 +ACH_1141_1564 3 +ACH_1892_2398 3 |