diff cgatools/tools/cgatools_1.5/join.xml @ 0:182426b32995 draft default tip

Uploaded
author completegenomics
date Mon, 18 Jun 2012 20:15:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cgatools/tools/cgatools_1.5/join.xml	Mon Jun 18 20:15:00 2012 -0400
@@ -0,0 +1,213 @@
+<tool id="cg_join" name="join(beta) 1.5" version="1.0.0">
+<!--
+This tool creates a GUI for the join function of cgatools from Complete Genomics, Inc.
+written 6-18-2012 by bcrain@completegenomics.com
+-->
+
+  <description>two tsv files based on equal fields or overlapping regions.</description> <!--adds description in toolbar-->
+
+  <requirements>
+  	<requirement type="binary">cgatools</requirement>
+  </requirements>
+
+  <command> <!--run executable-->
+		cgatools | head -1;
+		cgatools join --beta 
+		--input $inputA 
+		--input $inputB 
+		--output $output 
+		--output-mode $outmode 
+		$dump 
+		--select $col
+		#for $m in $matches <!--get all matched columns-->
+			--match ${m.match}
+		#end for
+		#if $range_overlap.range == 'yes'
+			#for $o in $range_overlap.overlaps <!--get all matched columns-->
+				--overlap ${o.overlap}
+			#end for
+			--overlap-mode $range_overlap.overlapmode
+			--overlap-fraction-A $range_overlap.fractionA
+			--boundary-uncertainty-A $range_overlap.boundaryA
+			--overlap-fraction-B $range_overlap.fractionB
+			--boundary-uncertainty-B $range_overlap.boundaryB
+		#end if
+  </command>
+
+  <outputs>
+		<data format="tabular" name="output" />
+  </outputs>
+  
+  <inputs>
+   	<!--form field to select input file A-->
+    <param name="inputA" type="data" format="tabular" label="Select input file A ">
+      <validator type="unspecified_build" />
+			<validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
+				metadata_name="dbkey" metadata_column="0"
+				message="cgatools is not currently available for this build."/>
+    </param>
+    
+  	<!--form field to select input file B-->
+    <param name="inputB" type="data" format="tabular" label="Select input file B ">
+      <validator type="unspecified_build" />
+			<validator type="dataset_metadata_in_file" filename="cg_crr_files.loc"
+				metadata_name="dbkey" metadata_column="0"
+				message="cgatools is not currently available for this build."/>
+    </param>
+    
+  	<!--form field to specify columns to print-->
+    <param name="col" type="text" value="A.*,B.*" size="40" label="Specify columns for output" help="The default value A.*,B.* prints all columns from both files, other selections enter in the format A.col_name1,A.col_name3,B.col_name1" />
+
+  	<!--form field to select output-mode-->
+		<param name="outmode" type="select" label="Select output mode">
+			<option value="full" selected="true">full (1 line for each match of records in A and B)</option>
+			<option value="compact">compact (1 line for each record in A, joining multiple records in B by semicolon)</option>
+			<option value="compact-pct">compact-pct (same as compact, annotated with % overlap)</option>
+		</param>
+
+		<!--form field to select columns to match-->
+		<param name="dump" type="select" label="Select records to print">
+			<option value="--always-dump" selected="true">print all records of A even if not matched in B</option>
+			<option value="">print only records of A that are matched in B</option>
+		</param>
+
+  	<!--form field to specify columns to match-->
+    <repeat name="matches" title="Exact match column">
+      <param name="match" type="text" size="40" label="Enter column:column" help="Enter column_from_A:column_from_B, e.g. chromosome:chromosome"/>
+    </repeat>
+    
+    <conditional name="range_overlap">
+    	<param name="range" type="select" label="Do you want to match columns by overlapping range?">
+    		<option value="no">no</option>
+    		<option value="yes">yes</option>
+    	</param>
+    	
+    	<when value="yes">
+				<!--form field to specify columns to overlap-->
+				<repeat name="overlaps" title="Range column">
+					<param name="overlap" type="text" size="40" label="Enter column&#91;,column&#93;:column&#91;,column&#93;" help="Enter range_start_from_A&#91;,range_stop_from_A&#93;:range_start_from_B&#91;,range_stop_from_B&#93;, e.g. begin,end:begin,end (overlapping range of positions) or begin,end:position"/>
+				</repeat>
+
+				<!--form field to select overlap-mode-->
+				<param name="overlapmode" type="select" label="Select overlap mode">
+					<option value="strict" selected="true">strict (overlap if A.begin&lt;B.end and B.begin&gt;A.end)</option>
+					<option value="allow-abutting-points">allow-abutting-points (overlap if A.begin&lt;B.end and B.begin&gt;A.end, or if A.begin&lt;=B.end and B.begin&lt;=A.end and either A or B has zero length.)</option>
+				</param>
+
+				<!--form fields to overlap options-->
+				<param name="fractionA" type="integer" value="0" label="Minimum fraction of A region overlap " />
+				<param name="boundaryA" type="integer" value="0" label="Boundary uncertainty for A for overlap filtering " help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-A * (A-range-length - boundary-uncertainty-A)"/>
+				
+				<param name="fractionB" type="integer" value="0" label="Minimum fraction of B region overlap " />
+				<param name="boundaryB" type="integer" value="0" label="Boundary uncertainty for overlap filtering "  help="Records failing the following boundary-uncertainty calculation are not included in the output: overlap length >= overlap-fraction-B * (B-range-length - boundary-uncertainty-B)"/>
+    	</when>
+		</conditional>
+  </inputs>
+  
+  <help>
+  
+**What it does**
+
+This tool joins two tab-delimited files based on equal fields or overlapping regions.
+
+**cgatools 1.5.0 Documentation**
+
+Userguide: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-user-guide.pdf
+
+Release notes: http://cgatools.sourceforge.net/docs/1.5.0/cgatools-release-notes.pdf
+
+**Command line reference**::
+
+		COMMAND NAME
+		  join - Joins two tab-delimited files based on equal fields or overlapping regions.
+		
+		DESCRIPTION
+		  Joins two tab-delimited files based on equal fields or overlapping regions.
+		  By default, an output record is produced for each match found between file 
+		  A and file B, but output format can be controlled by the --output-mode 
+		  parameter.
+		
+		OPTIONS
+		  -h [ --help ] 
+		      Print this help message.
+		
+		  --beta 
+		      This is a beta command. To run this command, you must pass the --beta 
+		      flag.
+		
+		  --input arg
+		      File name to use as input (may be passed in as arguments at the end of 
+		      the command), or omitted for stdin). There must be exactly two input 
+		      files to join. If only one file is specified by name, file A is taken 
+		      to be stdin and file B is the named file. File B is read fully into 
+		      memory, and file A is streamed. File A's columns appear first in the 
+		      output.
+		
+		  --output arg (=STDOUT)
+		      The output file name (may be omitted for stdout).
+		
+		  --match arg
+		      A match specification, which is a column from A and a column from B 
+		      separated by a colon.
+		
+		  --overlap arg
+		      Overlap specification. An overlap specification consists of a range 
+		      definition for files A and B, separated by a colon. A range definition 
+		      may be two columns, in which case they are interpreted as the beginning
+		      and end of the range. Or it may be one column, in which case the range 
+		      is defined as the 1-base range starting at the given value. The records
+		      from the two files must overlap in order to be considered for output. 
+		      Two ranges are considered to overlap if the overlap is at least one 
+		      base long, or if one of the ranges is length 0 and the ranges overlap 
+		      or abut. For example, "begin,end:offset" will match wherever end-begin 
+		      &gt; 0, begin&lt;offset+1, and end&gt;offset, or wherever end-begin = 0, 
+		      begin&lt;=offset+1, and end&gt;=offset.
+
+
+		  -m [ --output-mode ] arg (=full)
+		      Output mode, one of the following:
+		        full        Print an output record for each match found between 
+		                    file A and file B.
+		        compact     Print at most one record for each record of file A, 
+		                    joining the file B values by a semicolon and 
+		                    suppressing repeated B values and empty B values.
+		        compact-pct Same as compact, but for each distinct B value, 
+		                    annotate with the percentage of the A record that is 
+		                    overlapped by B records with that B value. Percentage 
+		                    is rounded up to nearest integer.
+		
+		  --overlap-mode arg (=strict)
+		      Overlap mode, one of the following:
+		        strict                Range A and B overlap if A.begin &lt; B.end and 
+		                              B.begin &lt; A.end.
+		        allow-abutting-points Range A and B overlap they meet the strict 
+		                              requirements, or if A.begin &lt;= B.end and 
+		                              B.begin &lt;= A.end and either A or B has zero 
+		                              length.
+
+		  --select arg (=A.*,B.*)
+		      Set of fields to select for output.
+		
+		  -a [ --always-dump ] 
+		      Dump every record of A, even if there are no matches with file B.
+		
+		  --overlap-fraction-A arg (=0)
+		      Minimum fraction of A region overlap for filtering output.
+		
+		  --boundary-uncertainty-A arg (=0)
+		      Boundary uncertainty for overlap filtering. Specifically, records 
+		      failing the following predicate are filtered away: overlap &gt;= 
+		      overlap-fraction-A * ( A-range-length - boundary-uncertainty-A )
+		
+		  --overlap-fraction-B arg (=0)
+		      Minimum fraction of B region overlap for filtering output.
+		
+		  --boundary-uncertainty-B arg (=0)
+		      Boundary uncertainty for overlap filtering. Specifically, records 
+		      failing the following predicate are filtered away: overlap &gt;= 
+		      overlap-fraction-B * ( B-range-length - boundary-uncertainty-B )
+
+		SUPPORTED FORMAT_VERSION
+		  Any
+  </help>
+</tool>