diff hd.xml @ 20:b084b6a8e3ac draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/hd commit e76960d95c059a78d880ed5ecd6202f54b091025
author mheinzl
date Fri, 14 Dec 2018 04:31:21 -0500
parents 2e9f7ea7ae93
children 7e570ba56b83
line wrap: on
line diff
--- a/hd.xml	Mon Oct 08 05:56:04 2018 -0400
+++ b/hd.xml	Fri Dec 14 04:31:21 2018 -0500
@@ -1,20 +1,16 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="hd" name="Duplex Sequencing Analysis: hd" version="1.0.0">
-    <description>Hamming distance (HD) analysis of tags</description>
+<tool id="hd" name="HD:" version="1.0.0">
+    <description>hamming distance analysis of duplex tags</description>
     <requirements>
         <requirement type="package" version="2.7">python</requirement>
         <requirement type="package" version="1.4.0">matplotlib</requirement>
     </requirements>
     <command>
-        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --inputFile2 '$inputFile2' --inputName2 '$inputFile2.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
-		$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular 
-        #if $inputFile2:
-        --output_pdf2 $output_pdf2 --output_tabular2 $output_tabular2
-        #end if
+        python2 '$__tool_directory__/hd.py' --inputFile '$inputFile' --inputName1 '$inputFile.name' --sample_size $sampleSize --subset_tag $subsetTag --nproc $nproc $onlyDCS --minFS $minFS --maxFS $maxFS
+		$nr_above_bars --output_pdf $output_pdf --output_tabular $output_tabular
     </command>
     <inputs>
-        <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false"/>
-        <param name="inputFile2" type="data" format="tabular" label="Dataset 2: input tags" optional="true" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family."/>
+        <param name="inputFile" type="data" format="tabular" label="Dataset 1: input tags" optional="false" help="Input in tabular format with the family size, tag and the direction of the strand ('ab' or 'ba') for each family."/>
         <param name="sampleSize" type="integer" label="number of tags in the sample" value="1000" min="0" help="specifies the number of tags in one analysis. If sample size is 0, all tags of the dataset are compared against all tags."/>
         <param name="minFS" type="integer" label="minimum family size of the tags" min="1" value="1" help="filters the tags after their family size: Families with a smaller size are skipped. Default: min. family size = 1."/>
         <param name="maxFS" type="integer" label="max family size of the tags" min="0" value="0" help="filters the tags after their family size: Families with a larger size are skipped. If max. family size is 0, no upper bound is defined and the maximum family size in the analysis will be the maximum family size of the whole dataset. Default: max. family size = 0."/>
@@ -26,51 +22,65 @@
     </inputs>
     <outputs>
         <data name="output_tabular" format="tabular"/>
-        <data name="output_tabular2" format="tabular">
-            <filter>inputFile2</filter>
-        </data>
         <data name="output_pdf" format="pdf" />
-        <data name="output_pdf2" format="pdf" >
-            <filter>inputFile2</filter>
-        </data>
     </outputs>
     <tests>
         <test>
             <param name="inputFile" value="Test_data.tabular"/>
-            <param name="inputFile2" value="Test_data2.tabular"/>
             <param name="sampleSize" value="0"/>
             <output name="output_pdf" file="output_file.pdf" lines_diff="6"/>
             <output name="output_tabular" file="output_file.tabular"/>
-            <output name="output_pdf2" file="output_file2.pdf" lines_diff="6"/>
-            <output name="output_tabular2" file="output_file2.tabular"/>
         </test>
     </tests>
     <help> <![CDATA[
 **What it does**
     
-    This tool calculates the Hamming distance for the tags by comparing them to all tags in the dataset and finally searches for the minimum Hamming distance. 
-    The Hamming distance is shown in a histogram separated by the family sizes or in a family size distribution separated by the Hamming distances. 
-    This similarity measure was calculated for each tag to distinguish whether similar tags truly stem from different molecules or occured due to sequencing or PCR errros. 
-    In addition, the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag. 
-    This analysis can be performed on only a sample (by default: sample size=1000) or on the whole dataset (sample size=0). 
-    It is also possible to select on only those tags, which have a partner tag (ab and ba) in the dataset (DCSs) or to filter the dataset after the tag's family size. 
+This tool calculates the Hamming distance for the tags by comparing them to all tags in the dataset and finally searches for the minimum Hamming distance. 
+The Hamming distance is shown in a histogram separated by the family sizes or in a family size distribution separated by the Hamming distances. 
+This similarity measure is calculated for each tag to distinguish whether similar tags truly stem from different molecules or occurred due to sequencing or PCR errors.
+In addition, the tags of chimeric reads can be identified by calculating the Hamming distance for each half of the tag. 
+This analysis can be performed on only a sample (by default: sample size=1000) or on the whole dataset (sample size=0). 
+It is also possible to select only those tags that have a partner tag (ab and ba) in the dataset (DCSs) or to filter the dataset by the tags' family size.
     
 **Input**
     
-    This tools expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands. It is possible to upload two files which allows the performance of two analyses at the same time.
+This tool expects a tabular file with the tags of all families, their sizes and information about forward (ab) and reverse (ba) strands::
     
-    +-----+----------------------------+----+
-    | 1   | AAAAAAAAAAAATGTTGGAATCTT   | ba |
-    +-----+----------------------------+----+
-    | 10  | AAAAAAAAAAAGGCGGTCCACCCC   | ab |
-    +-----+----------------------------+----+
-    | 28  | AAAAAAAAAAATGGTATGGACCGA   | ab |
-    +-----+----------------------------+----+
+    1  AAAAAAAAAAAATGTTGGAATCTT ba
+   10  AAAAAAAAAAAGGCGGTCCACCCC ab
+   28  AAAAAAAAAAATGGTATGGACCGA ab
+
+**How to generate the input**
+
+The first step of the `Du Novo Analysis Pipeline <https://doi.org/10.1186/s13059-016-1039-4>`_ is the **Make Families** tool that produces output in this form::
+
+    1                        2  3     4
+    ------------------------------------------------------
+    AAAAAAAAAAAAAAATAGCTCGAT ba read1 CGCTACGTGACTGGGTCATG
+    AAAAAAAAAAAAAAATAGCTCGAT ba read2 CGCTACGTGACTGGGTCATG
+    AAAAAAAAAAAAAAATAGCTCGAT ba read3 CGCTACGTGACTGGGTCATG
+
+We only need columns 1 and 2. These two columns can be extracted from this dataset using the **Cut** tool::
+
+    1                        2 
+    ---------------------------
+    AAAAAAAAAAAAAAATAGCTCGAT ba
+    AAAAAAAAAAAAAAATAGCTCGAT ba
+    AAAAAAAAAAAAAAATAGCTCGAT ba
+
+Now one needs to count the number of unique occurrences of each tag. This is done using the **Unique lines** tool, which adds an additional column containing the counts (column 1)::
+
+
+    1 2                        3 
+    -----------------------------
+    3 AAAAAAAAAAAAAAATAGCTCGAT ba
+ 
+These data can now be used in this tool.
     
     
 **Output**
     
-    The output is one PDF file with the plots of the Hamming distance and a tabular file with the data of the plot for each dataset.
+The output is one PDF file with the plots of the Hamming distance and one tabular file with the data of the plots.
     
     
 **About Author**