Repository 'hd'
hg clone https://toolshed.g2.bx.psu.edu/repos/mheinzl/hd

Changeset 19:2e9f7ea7ae93 (2018-10-08)
Previous changeset 18:a8581bf627fd (2018-05-23) Next changeset 20:b084b6a8e3ac (2018-12-14)
Commit message:
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit dfaab79252a858e8df16bbea3607ebf1b6962e5a-dirty
modified:
hd.py
hd.xml
added:
test-data/Test_data.tabular
test-data/Test_data2.tabular
test-data/output_file.pdf
test-data/output_file.tabular
test-data/output_file2.pdf
test-data/output_file2.tabular
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 hd.py
--- a/hd.py Wed May 23 14:47:43 2018 -0400
+++ b/hd.py Mon Oct 08 05:56:04 2018 -0400
[
b'@@ -13,34 +13,35 @@\n # It is also possible to perform the HD analysis with shortened tags with given sizes as input.\n # The tool can run on a certain number of processors, which can be defined by the user.\n \n-# USAGE: python HDnew6_1Plot_FINAL.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /\n-#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False--output_csv outptufile_name_csv --output_pdf outptufile_name_pdf\n+# USAGE: python hd.py --inputFile filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --sample_size int/0 --sep "characterWhichSeparatesCSVFile" /\n+#        --only_DCS True --FamilySize3 True --subset_tag True --nproc int --minFS int --maxFS int --nr_above_bars True/False --output_tabular outptufile_name_tabular --output_pdf outputfile_name_pdf\n \n-import numpy\n+import argparse\n import itertools\n import operator\n-import matplotlib.pyplot as plt\n-import os.path\n-import cPickle as pickle\n-from multiprocessing.pool import Pool\n+import sys\n+from collections import Counter\n from functools import partial\n-import argparse\n-import sys\n-import os\n+from multiprocessing.pool import Pool\n+\n+import matplotlib.pyplot as plt\n+import numpy\n from matplotlib.backends.backend_pdf import PdfPages\n-from collections import Counter\n+\n+plt.switch_backend(\'agg\')\n \n-def plotFSDwithHD2(familySizeList1,maximumXFS,minimumXFS, originalCounts,\n-                   title_file1, subtitle, pdf, relative=False, diff = True):\n+\n+def plotFSDwithHD2(familySizeList1, maximumXFS, minimumXFS, originalCounts,\n+                   title_file1, subtitle, pdf, relative=False, diff=True):\n     if diff is False:\n         colors = ["#e6194b", "#3cb44b", "#ffe119", "#0082c8", "#f58231", "#911eb4"]\n-        labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8","HD>8"]\n+        labels = ["HD=1", "HD=2", "HD=3", "HD=4", "HD=5-8", "HD>8"]\n     else:\n         colors = ["#93A6AB", "#403C14", "#731E41", "#BAB591", "#085B6F", "#E8AA35", "#726C66"]\n         if relative is True:\n             labels = ["d=0", "d=0.1", "d=0.2", "d=0.3", "d=0.4", "d=0.5-0.8", "d>0.8"]\n         else:\n-            labels = ["d=0","d=1", "d=2", "d=3", "d=4", "d=5-8","d>8"]\n+            labels = ["d=0", "d=1", "d=2", "d=3", "d=4", "d=5-8", "d>8"]\n \n     fig = plt.figure(figsize=(6, 7))\n     ax = fig.add_subplot(111)\n@@ -54,11 +55,11 @@\n         range1 = range(0, maximumXFS + 2)\n     counts = plt.hist(familySizeList1, label=labels,\n                       color=colors, stacked=True,\n-                      rwidth=0.8,alpha=1, align="left",\n-                      edgecolor="None",bins=range1)\n+                      rwidth=0.8, alpha=1, align="left",\n+                      edgecolor="None", bins=range1)\n     plt.legend(loc=\'upper right\', fontsize=14, frameon=True, bbox_to_anchor=(1.45, 1))\n \n-    #plt.title(title_file1, fontsize=12)\n+    # plt.title(title_file1, fontsize=12)\n     plt.suptitle(subtitle, y=1, x=0.5, fontsize=14)\n     plt.xlabel("Family size", fontsize=14)\n     plt.ylabel("Absolute Frequency", fontsize=14)\n@@ -79,20 +80,17 @@\n     plt.text(0.15, -0.08, legend, size=12, transform=plt.gcf().transFigure)\n \n     count = numpy.bincount(originalCounts)  # original counts\n-    legend1 = "{}\\n{}\\n{:.5f}" \\\n-        .format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))\n+    legend1 = "{}\\n{}\\n{:.5f}".format(max(originalCounts), count[len(count) - 1], float(count[len(count) - 1]) / sum(count))\n     plt.text(0.5, -0.08, legend1, size=12, transform=plt.gcf().transFigure)\n-    legend3 = "singletons\\n{:,}\\n{:.5f}".format(int(counts[0][len(counts[0]) - 1][1]),\n-                                                float(counts[0][len(counts[0]) - 1][1]) / sum(\n-                                                    counts[0][len(counts[0]) - 1]))\n+    legend3 = "singletons\\n{:,}\\n{:.5f'..b'sumCol13 = createTableHD(listRelDifference1, "diff=")\n             overallSum13 = sum(sumCol13)\n \n-            ## FSD\n-            # absolute difference\n-        #    summary19, sumCol19 = createTableFSD2(familySizeList1_diff)\n-        #    overallSum19 = sum(sumCol19)\n-            # relative difference\n-         #   summary21, sumCol21 = createTableFSD2(familySizeList1_reldiff)\n-          #  overallSum21 = sum(sumCol21)\n-\n             # chimeric reads\n             if len(minHD_tags_zeros) != 0:\n                 # absolute difference and tags where at least one half has HD=0\n                 summary15, sumCol15 = createTableHD(listDifference1_zeros, "HD=")\n                 overallSum15 = sum(sumCol15)\n-                # absolute difference and tags where at least one half has HD=0\n-           #     summary23, sumCol23 = createTableFSD2(familySizeList1_diff_zeros, diff=False)\n-            #    overallSum23 = sum(sumCol23)\n \n             output_file.write("{}\\n".format(name_file))\n             output_file.write("number of tags per file{}{:,} (from {:,}) against {:,}\\n\\n".format(sep, len(\n                 numpy.concatenate(list1)), lenTags, lenTags))\n \n-            ### HD ###\n+            # HD\n             createFileHD(summary, sumCol, overallSum, output_file,\n                          "Hamming distance separated by family size", sep)\n-            ### FSD ###\n+            # FSD\n             createFileFSD2(summary5, sumCol5, overallSum5, output_file,\n                            "Family size distribution separated by Hamming distance", sep,\n                            diff=False)\n \n             count = numpy.bincount(quant)\n-            #output_file.write("{}{}\\n".format(sep, name_file))\n+            # output_file.write("{}{}\\n".format(sep, name_file))\n             output_file.write("\\n")\n             output_file.write("max. family size:{}{}\\n".format(sep, max(quant)))\n             output_file.write("absolute frequency:{}{}\\n".format(sep, count[len(count) - 1]))\n             output_file.write(\n                 "relative frequency:{}{}\\n\\n".format(sep, float(count[len(count) - 1]) / sum(count)))\n \n-            ### HD within tags ###\n+            # HD within tags\n             output_file.write(\n                 "The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.\\n"\n                 "It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.\\n")\n@@ -1019,24 +975,13 @@\n             createFileHD(summary13, sumCol13, overallSum13, output_file,\n                          "Chimera analysis: relative delta Hamming distances", sep)\n \n-        #    createFileFSD2(summary19, sumCol19, overallSum19, output_file,\n-         #                  "Family size distribution separated by absolute delta Hamming distance",\n-          #                 sep)\n-          #  createFileFSD2(summary21, sumCol21, overallSum21, output_file,\n-           #                "Family size distribution separated by relative delta Hamming distance",\n-            #               sep, rel=True)\n-\n             if len(minHD_tags_zeros) != 0:\n                 output_file.write(\n                     "Chimeras:\\nAll tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.\\nSo the hamming distance of the non-identical half is compared.\\n")\n                 createFileHD(summary15, sumCol15, overallSum15, output_file,\n                              "Hamming distances of non-zero half", sep)\n-         #       createFileFSD2(summary23, sumCol23, overallSum23, output_file,\n-          #                     "Family size distribution separated by Hamming distance of non-zero half",\n-           #                    sep, diff=False)\n             output_file.write("\\n")\n \n \n-\n if __name__ == \'__main__\':\n     sys.exit(Hamming_Distance_Analysis(sys.argv))\n'
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 hd.xml
--- a/hd.xml Wed May 23 14:47:43 2018 -0400
+++ b/hd.xml Mon Oct 08 05:56:04 2018 -0400
[
b'@@ -1,33 +1,32 @@\n <?xml version="1.0" encoding="UTF-8"?>\n-<tool id="hd" name="Duplex Sequencing Analysis: hd" version="0.0.19">\n+<tool id="hd" name="Duplex Sequencing Analysis: hd" version="1.0.0">\n+    <description>Hamming distance (HD) analysis of tags</description>\n     <requirements>\n         <requirement type="package" version="2.7">python</requirement>\n-        <requirement type="package" version="1.4">matplotlib</requirement>\n+        <requirement type="package" version="1.4.0">matplotlib</requirement>\n     </requirements>\n-    <description>Hamming distance (HD) analysis of tags</description>\n     <command>\n-        python2 $__tool_directory__/hd.py --inputFile "$inputFile" --inputName1 "$inputFile.name" --inputFile2 "$inputFile2" --inputName2 "$inputFile2.name" --sample_size $sampleSize --sep $separator --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS\n-\t\t$nr_above_bars --output_pdf $output_pdf --output_csv $output_csv \n+        python2 \'$__tool_directory__/hd.py\' --inputFile \'$inputFile\' --inputName1 \'$inputFile.name\' --inputFile2 \'$inputFile2\' --inputName2 \'$inputFile2.name\' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS\n+\t\t$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular \n         #if $inputFile2:\n-        --output_pdf2 $output_pdf2 --output_csv2 $output_csv2\n+        --output_pdf2 $output_pdf2 --output_tabular2 $output_tabular2\n         #end if\n     </command>\n     <inputs>\n         <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>\n-        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tags and the direction of the strand (\'ab\' or \'ba\') for each family."/>\n+        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tag and the direction of the strand (\'ab\' or \'ba\') for each family."/>\n         <param name="sampleSize" type="integer" label="number of tags in the sample" value="1000" min="0" help="specifies the number of tags in one analysis. If sample size is 0, all tags of the dataset are compared against all tags."/>\n-        <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with smaller size are skipped. Default: min. family size = 1."/>\n-        <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>\n-        <param name="separator" type="text" label="Separator of the CSV file." help="can be a single character" value=","/>\n-        <param name="onlyDCS" type="boolean" label="only DCS in the analysis?" truevalue="" falsevalue="--only_DCS" checked="False" help="Only tags, which have a partner tag in the dataset, are included in the analysis."/>\n-        <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="An analysis with shorter tag length, which is specified by this parameter, is simulated. If this parameter is 0 (by default), the tag with its original length is used in the analysis."/>\n+        <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with a smaller size are skipped. Default: min. family size = 1."/>\n+        <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with a larger size are skipped. If max. family size is '..b'a) in the dataset, are included in the analysis."/>\n+        <param name="subsetTag" type="integer" label="shorten tag in the analysis?" value="0" help="By this parameter an analysis with shorter tag length is simulated. If this parameter is 0 (by default), the tags with its original length are used in the analysis."/>\n         <param name="nproc" type="integer" label="number of processors" value="8" help="Number of processor used for computing."/>\n-        <param name="nr_above_bars" type="boolean" label="include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the bar can be included or removed in the plot. "/>\n+        <param name="nr_above_bars" type="boolean" label="include numbers above bars?" truevalue="--nr_above_bars" falsevalue="" checked="True" help="The absolute and relative values of the data can be included or removed from the plots. "/>\n  \n     </inputs>\n     <outputs>\n-        <data name="output_csv" format="csv"/>\n-        <data name="output_csv2" format="csv">\n+        <data name="output_tabular" format="tabular"/>\n+        <data name="output_tabular2" format="tabular">\n             <filter>inputFile2</filter>\n         </data>\n         <data name="output_pdf" format="pdf" />\n@@ -35,19 +34,30 @@\n             <filter>inputFile2</filter>\n         </data>\n     </outputs>\n+    <tests>\n+        <test>\n+            <param name="inputFile" value="Test_data.tabular"/>\n+            <param name="inputFile2" value="Test_data2.tabular"/>\n+            <param name="sampleSize" value="0"/>\n+            <output name="output_pdf" file="output_file.pdf" lines_diff="6"/>\n+            <output name="output_tabular" file="output_file.tabular"/>\n+            <output name="output_pdf2" file="output_file2.pdf" lines_diff="6"/>\n+            <output name="output_tabular2" file="output_file2.tabular"/>\n+        </test>\n+    </tests>\n     <help> <![CDATA[\n **What it does**\n     \n     This tool calculates the Hamming distance for the tags by comparing them to all tags in the dataset and finally searches for the minimum Hamming distance. \n     The Hamming distance is shown in a histogram separated by the family sizes or in a family size distribution separated by the Hamming distances. \n     This similarity measure was calculated for each tag to distinguish whether similar tags truly stem from different molecules or occured due to sequencing or PCR errros. \n-    In addition the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag. \n+    In addition, the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag. \n     This analysis can be performed on only a sample (by default: sample size=1000) or on the whole dataset (sample size=0). \n-    It is also possible to select on only those tags, which have a partner tag in the dataset (DCSs) or to filter the dataset after the tag\'s family size.  \n+    It is also possible to select on only those tags, which have a partner tag (ab and ba) in the dataset (DCSs) or to filter the dataset after the tag\'s family size. \n     \n **Input**\n     \n-    This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. \n+    This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. It is possible to upload two files which allows the performance of two analyses at the same time.\n     \n     +-----+----------------------------+----+\n     | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |\n@@ -60,7 +70,7 @@\n     \n **Output**\n     \n-    The output is one PDF file with the plots of the Hamming distance and a CSV with the data of the plot for each dataset.\n+    The output is one PDF file with the plots of the Hamming distance and a tabular file with the data of the plot for each dataset.\n     \n     \n **About Author**\n'
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 test-data/Test_data.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data.tabular Mon Oct 08 05:56:04 2018 -0400
b
@@ -0,0 +1,20 @@
+1 AAAAAAAAAAAAAACCAAAACTTC ba
+1 AAAAAAAAAAAAACCAGGCGTCGA ba
+1 AAAAAAAAAAAAAGCTCCACGTTG ba
+1 AAAAAAAAAAAAATCGTGGTTTGT ba
+1 AAAAAAAAAAAAATTCACCCTTGT ba
+7 AAAAAAAAAAAACACACTTAACTT ba
+1 AAAAAAAAAAAACAGTGTTGAGAC ba
+4 AAAAAAAAAAAACCGCTCCTCACA ba
+1 AAAAAAAAAAAAGGCAACACAGAA ab
+2 AAAAAAAAAAAATCTTTCTTTGAG ab
+1 AAAAAAAAAAAATTGGGTTCCTTA ab
+1 AAAAAAAAAAAGAGTCGCACCCAG ba
+4 AAAAAAAAAAAGATCGTGGTTTGT ba
+1 AAAAAAAAAAAGCGCAACACAGAA ab
+3 AAAAAAAAAAAGGGCAACACAGAA ab
+1 AAAAAAAAAAAGTAGCCCTAAACG ab
+1 AAAAAAAAAAAGTCTTTCTTTGAG ab
+1 AAAAAAAAAAATATCATAGACTCT ab
+6 AAAAAAAAAAATATTCACCCTTGT ba
+1 AAAAAAAAAAATATTCGAAAGTTA ba
\ No newline at end of file
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 test-data/Test_data2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Test_data2.tabular Mon Oct 08 05:56:04 2018 -0400
b
@@ -0,0 +1,20 @@
+1 AAAAAAAACCGCCCAACTGCCGGT ab
+5 AAAAAAAACCTCTCAACCCCAAAT ba
+7 AAAAAAAACCTCTTGCGATGTTGT ab
+1 AAAAAAAACCTCTTGCGCTGTTGT ab
+1 AAAAAAAACCTCTTGTGATGTTGT ab
+12 AAAAAAAACCTGAGCAATGGTTCC ab
+3 AAAAAAAACCTTGACCCTCACATG ba
+6 AAAAAAAACCTTGCACTCGTCCTA ba
+9 AAAAAAAACGAAATAAAAAAACCT ba
+1 AAAAAAAACGACCGGCCTTAGACA ba
+4 AAAAAAAACGCCACCACCCCCTTT ab
+12 AAAAAAAACGCCACGGGCACTATT ba
+13 AAAAAAAACGTATCAGTAGATCCT ab
+1 AAAAAAAACTAGTAGGATTTCATG ba
+3 AAAAAAAACTATAGAAAATCCATT ba
+1 AAAAAAAACTATTCTATTTCCGAT ba
+13 AAAAAAAACTGATCTGCTTGGCGG ba
+8 AAAAAAAACTTGCGAATAGCATCG ba
+4 AAAAAAAACTTGTTATCAAAACGT ab
+1 AAAAAAAAGAAAAGTTCAACACGC ba
\ No newline at end of file
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 test-data/output_file.pdf
b
Binary file test-data/output_file.pdf has changed
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 test-data/output_file.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file.tabular Mon Oct 08 05:56:04 2018 -0400
b
@@ -0,0 +1,85 @@
+Test_data
+number of tags per file 20 (from 20) against 20
+
+Hamming distance separated by family size
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+HD=1 5 1 1 1 1 0 9
+HD=6 3 0 0 0 0 0 3
+HD=7 4 0 0 0 1 0 5
+HD=8 2 0 0 1 0 0 3
+sum 14 1 1 2 2 0 20
+
+Family size distribution separated by Hamming distance
+ HD=1 HD=2 HD=3 HD=4 HD=5-8 HD>8 sum
+FS=1 5 0 0 0 9 0 14
+FS=2 1 0 0 0 0 0 1
+FS=3 1 0 0 0 0 0 1
+FS=4 1 0 0 0 1 0 2
+FS=6 1 0 0 0 0 0 1
+FS=7 0 0 0 0 1 0 1
+sum 9 0 0 0 11 0 20
+
+
+max. family size: 7
+absolute frequency: 1
+relative frequency: 0.05
+
+The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.
+It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.
+actual number of tags with min HD = 171 (sample size by user = 20)
+length of one part of the tag = 12
+
+Hamming distance of each half in the tag
+ HD a HD b' HD b HD a' HD a+b sum
+HD=0 146 0 8 4 0 158
+HD=1 0 2 2 21 11 36
+HD=2 0 0 0 0 1 1
+HD=5 0 0 4 0 0 4
+HD=6 0 2 2 0 6 10
+HD=7 0 16 9 0 21 46
+HD=8 0 20 0 0 26 46
+HD=9 0 50 0 0 50 100
+HD=10 0 30 0 0 30 60
+HD=11 0 18 0 0 18 36
+HD=12 0 8 0 0 8 16
+sum 146 146 25 25 171 513
+
+Absolute delta Hamming distances within the tag
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+diff=0 1 0 0 0 0 0 1
+diff=1 6 1 2 1 1 0 11
+diff=4 4 0 0 0 0 0 4
+diff=5 2 0 0 0 0 0 2
+diff=6 6 0 0 1 1 0 8
+diff=7 15 0 1 0 3 0 19
+diff=8 15 2 0 1 2 0 20
+diff=9 37 4 1 4 4 0 50
+diff=10 22 2 1 4 1 0 30
+diff=11 8 1 1 5 3 0 18
+diff=12 6 1 0 1 0 0 8
+sum 122 11 6 17 15 0 171
+
+Chimera analysis: relative delta Hamming distances
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+diff=0.0 1 0 0 0 0 0 1
+diff=0.7 6 0 0 0 0 0 6
+diff=0.8 4 0 0 1 1 0 6
+diff=1.0 111 11 6 16 14 0 158
+sum 122 11 6 17 15 0 171
+
+Chimeras:
+All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.
+So the hamming distance of the non-identical half is compared.
+Hamming distances of non-zero half
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+HD=1 6 1 2 1 1 0 11
+HD=6 2 0 0 0 0 0 2
+HD=7 15 0 1 0 3 0 19
+HD=8 15 2 0 1 2 0 20
+HD=9 37 4 1 4 4 0 50
+HD=10 22 2 1 4 1 0 30
+HD=11 8 1 1 5 3 0 18
+HD=12 6 1 0 1 0 0 8
+sum 111 11 6 16 14 0 158
+
+
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 test-data/output_file2.pdf
b
Binary file test-data/output_file2.pdf has changed
b
diff -r a8581bf627fd -r 2e9f7ea7ae93 test-data/output_file2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_file2.tabular Mon Oct 08 05:56:04 2018 -0400
b
@@ -0,0 +1,97 @@
+Test_data2
+number of tags per file 20 (from 20) against 20
+
+Hamming distance separated by family size
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+HD=1 2 0 0 0 1 0 3
+HD=6 0 0 0 1 0 1 2
+HD=7 2 0 1 1 2 1 7
+HD=8 1 0 1 0 2 1 5
+HD=9 1 0 0 0 0 1 2
+HD=10 1 0 0 0 0 0 1
+sum 7 0 2 2 5 4 20
+
+Family size distribution separated by Hamming distance
+ HD=1 HD=2 HD=3 HD=4 HD=5-8 HD>8 sum
+FS=1 2 0 0 0 3 2 7
+FS=3 0 0 0 0 2 0 2
+FS=4 0 0 0 0 2 0 2
+FS=5 0 0 0 0 1 0 1
+FS=6 0 0 0 0 1 0 1
+FS=7 1 0 0 0 0 0 1
+FS=8 0 0 0 0 1 0 1
+FS=9 0 0 0 0 1 0 1
+FS=12 0 0 0 0 2 0 2
+FS=13 0 0 0 0 1 1 2
+sum 3 0 0 0 14 3 20
+
+
+max. family size: 13
+absolute frequency: 2
+relative frequency: 0.1
+
+The hamming distances were calculated by comparing each half of all tags against the tag(s) with the minimum Hamming distance per half.
+It is possible that one tag can have the minimum HD from multiple tags, so the sample size in this calculation differs from the sample size entered by the user.
+actual number of tags with min HD = 79 (sample size by user = 20)
+length of one part of the tag = 12
+
+Hamming distance of each half in the tag
+ HD a HD b' HD b HD a' HD a+b sum
+HD=0 20 0 0 5 0 25
+HD=1 22 4 4 3 8 41
+HD=2 9 2 0 9 2 22
+HD=3 0 0 0 10 0 10
+HD=4 0 0 2 1 0 3
+HD=5 0 0 5 0 0 5
+HD=6 0 5 7 0 3 15
+HD=7 0 7 10 0 10 27
+HD=8 0 6 0 0 10 16
+HD=9 0 7 0 0 17 24
+HD=10 0 11 0 0 13 24
+HD=11 0 8 0 0 7 15
+HD=12 0 1 0 0 5 6
+HD=13 0 0 0 0 4 4
+sum 51 51 28 28 79 237
+
+Absolute delta Hamming distances within the tag
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+diff=1 5 0 0 1 5 0 11
+diff=2 4 0 0 0 0 0 4
+diff=3 1 0 2 1 1 0 5
+diff=4 1 0 1 0 2 1 5
+diff=5 2 0 0 0 4 6 12
+diff=6 1 0 0 1 1 7 10
+diff=7 2 0 1 0 0 0 3
+diff=8 0 0 1 0 1 3 5
+diff=9 6 0 0 1 3 4 14
+diff=10 4 0 0 0 3 2 9
+diff=11 0 0 0 0 0 1 1
+sum 26 0 5 4 20 24 79
+
+Chimera analysis: relative delta Hamming distances
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+diff=0.1 1 0 0 1 1 0 3
+diff=0.3 3 0 2 0 0 0 5
+diff=0.4 1 0 0 1 3 0 5
+diff=0.5 0 0 1 0 0 1 2
+diff=0.6 1 0 0 0 3 7 11
+diff=0.7 1 0 0 0 1 5 7
+diff=0.8 10 0 0 0 2 9 21
+diff=1.0 9 0 2 2 10 2 25
+sum 26 0 5 4 20 24 79
+
+Chimeras:
+All tags were filtered: only those tags where at least one half is identical with the half of the min. tag are kept.
+So the hamming distance of the non-identical half is compared.
+Hamming distances of non-zero half
+ FS=1 FS=2 FS=3 FS=4 FS=5-10 FS>10 sum
+HD=1 4 0 0 0 4 0 8
+HD=2 2 0 0 0 0 0 2
+HD=6 0 0 0 1 0 2 3
+HD=7 1 0 1 0 0 0 2
+HD=8 0 0 1 0 1 0 2
+HD=9 1 0 0 1 2 0 4
+HD=10 1 0 0 0 3 0 4
+sum 9 0 2 2 10 2 25
+
+