diff tools/stats/grouping.xml @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/stats/grouping.xml	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,123 @@
+<tool id="Grouping1" name="Group" version="2.0.0">
+  <description>data by a column and perform aggregate operation on other columns.</description>
+  <command interpreter="python">
+    grouping.py 
+      $out_file1
+      $input1
+      $groupcol
+      $ignorecase
+      #for $op in $operations
+       '${op.optype}
+        ${op.opcol}
+        ${op.opround}'
+      #end for
+  </command> <!-- Positional args: output, input, group column, ignore-case flag; then one single-quoted "optype opcol opround" argument per Operation -->
+  <!-- Inputs: a tabular dataset, the column to group by, a case-sensitivity flag,
+       and a repeatable list of (operation type, target column, rounding) triples. -->
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/>
+    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
+    <param name="ignorecase" type="boolean" truevalue="1" falsevalue="0" label="Ignore case while grouping?" />
+    <repeat name="operations" title="Operation">
+      <param name="optype" type="select" label="Type">
+        <option value="mean">Mean</option>
+        <option value="median">Median</option>
+        <option value="mode">Mode</option>
+        <option value="max">Maximum</option>
+        <option value="min">Minimum</option>
+        <option value="sum">Sum</option>
+        <option value="length">Count</option>
+        <option value="unique">Count Distinct</option>
+        <option value="cat">Concatenate</option>
+        <option value="cat_uniq">Concatenate Distinct</option>
+        <option value="random">Randomly pick</option>
+        <option value="std">Standard deviation</option>
+      </param>
+      <param name="opcol" label="On column" type="data_column" data_ref="input1" />
+      <param name="opround" type="select" label="Round result to nearest integer?">
+        <option value="no">NO</option>
+        <option value="yes">YES</option>
+      </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" /> <!-- tabular result; referenced as $out_file1, the first argument in the command -->
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement> <!-- presumably imported by grouping.py; that script is not shown here, so confirm -->
+  </requirements>
+  <tests>
+    <!-- Valid data: group 1.bed on column 1 (ignoring case) and take the unrounded mean of column 2 -->
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="true"/>
+      <param name="optype" value="mean"/>
+      <param name="opcol" value="2"/>
+      <param name="opround" value="no"/>
+      <output name="out_file1" file="groupby_out1.dat"/>
+    </test>
+    <!-- Disabled: long case that runs all twelve operation types in one job; the test framework does not allow this yet
+    <test>
+      <param name="input1" value="1.bed"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="false"/>
+      <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/>
+      <output name="out_file1" file="groupby_out3.tabular"/>
+    </test>
+    -->
+    <!-- Disabled: input with an invalid value in a column; the test framework does not allow testing of errors
+    <test>
+      <param name="input1" value="1.tabular"/>
+      <param name="groupcol" value="1"/>
+      <param name="ignorecase" value="true"/>
+      <param name="optype" value="mean"/>
+      <param name="opcol" value="2"/>
+      <param name="opround" value="no"/>
+      <output name="out_file1" file="groupby_out2.dat"/>
+    </test>
+     -->
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation-&gt;Convert*
+
+-----
+
+**Syntax**
+
+This tool allows you to group the input dataset by a particular column and perform aggregate functions on any column(s): Mean, Median, Mode, Maximum, Minimum, Sum, Count, Count Distinct, Concatenate, Concatenate Distinct, Randomly pick, and Standard deviation.
+
+The Concatenate function will take, for each group, each item in the specified column and build a comma-delimited list. Concatenate Distinct will do the same but will build a list of unique items with no repetition.
+
+Count and Count Distinct are equivalent to Concatenate and Concatenate Distinct, respectively, but will only count the number of items and will return an integer.
+
+- If multiple modes are present, all are reported.
+
+-----
+
+**Example**
+
+- For the following input::
+
+   chr22  1000  1003  TTT
+   chr22  2000  2003  aaa
+   chr10  2200  2203  TTT
+   chr10  1200  1203  ttt
+   chr22  1600  1603  AAA
+
+- **Grouping on column 4** while ignoring case, and performing operation **Count on column 1** will return::
+
+   AAA    2
+   TTT    3
+   
+- **Grouping on column 4** while not ignoring case, and performing operation **Count on column 1** will return::
+
+   aaa    1
+   AAA    1
+   ttt    1
+   TTT    2
+  </help>
+</tool>