view tools/new_operations/column_join.xml @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

<tool id="column_join" name="Column Join" version="1.1.0">
  <description></description>
  <!-- Cheetah-templated command line: always passes the output, input1, input2,
       hinge and columns options to column_join.py; adds the fill_options_file
       option only when fill_empty_columns_switch is "fill_empty"; then appends
       every dataset from the file_chooser repeat as a trailing argument. -->
  <command interpreter="python">
    column_join.py
        --output=$output
        --input1=$input1
        --input2=$input2
        --hinge=$hinge
        --columns=$columns
        #if $fill_empty_columns.fill_empty_columns_switch == "fill_empty":
            --fill_options_file=$fill_options_file
        #end if
        #for $f in $file_chooser:
            ${f.input}
        #end for
  </command>
  <inputs>
    <!-- First dataset of the join; the hinge, column and per-column fill
         selectors below all take their column lists from it (data_ref). -->
    <param name="input1" type="data" format="tabular" label="Choose the first file for the join" />
    <param name="hinge" type="data_column" data_ref="input1" multiple="false" numerical="false" label="Use this column and the columns to its left as the 'hinge' (matching data for each join)" help="All columns to the left of the selected column (plus the selected column) will be used. Select 2 for pileup" />
    <param name="columns" type="data_column" data_ref="input1" multiple="true" numerical="false" label="Include these columns" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
    <!-- Optional filling of empty cells: either one value for every column,
         or a value chosen per column through the column_fill repeat. -->
    <conditional name="fill_empty_columns">
      <param name="fill_empty_columns_switch" type="select" label="Fill empty columns">
        <option value="no_fill" selected="True">No</option>
        <option value="fill_empty">Yes</option>
      </param>
      <when value="no_fill" />
      <when value="fill_empty">
        <conditional name="do_fill_empty_columns">
          <param name="column_fill_type" type="select" label="Fill Columns by">
            <option value="single_fill_value" selected="True">Single fill value</option>
            <option value="fill_value_by_column">Values by column</option>
          </param>
          <when value="single_fill_value">
            <param type="text" name="fill_value" label="Fill value" value="." />
          </when>
          <when value="fill_value_by_column">
            <repeat name="column_fill" title="Fill Column">
              <param name="column_number" label="Column" type="data_column" data_ref="input1" />
              <param type="text" name="fill_value" label="Fill value" value="." />
            </repeat>
          </when>
        </conditional>
      </when>
    </conditional>
    <param name="input2" type="data" format="tabular" label="Choose the second file for the join" />
    <!-- Any number of further datasets; each one is appended to the command
         line after the named options. -->
    <repeat name="file_chooser" title="Additional Input">
      <param name="input" label="Additional input file" type="data" format="tabular" />
    </repeat>
  </inputs>
  <configfiles>
    <!-- fill_options_file: Cheetah template that renders a JSON object via
         simplejson.dumps. The object is empty when filling is switched off.
         When filling is on it holds 'file1_columns', a list with one entry per
         column of input1 (input1.metadata.columns). Each entry starts as the
         single fill value, or as "" in by-column mode, where the entry at
         index column_number - 1 is then replaced by that column's fill value. -->
    <configfile name="fill_options_file">&lt;%
import simplejson
%&gt;
#set $__fill_options = {}
#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty':
    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value':
        #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value
    #else:
        #set $__start_fill = ""
    #end if
    #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ]
    #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column':
        #for column_fill in $fill_empty_columns['do_fill_empty_columns']['column_fill']:
            #set $__fill_options['file1_columns'][ int( column_fill['column_number'].value ) - 1 ] = column_fill['fill_value'].value
        #end for
    #end if
#end if
${simplejson.dumps( __fill_options )}
    </configfile>
  </configfiles>
  <outputs>
    <!-- Single tabular dataset; its path is handed to column_join.py through
         $output in the command template. -->
    <data name="output" format="tabular" />
  </outputs>
  <tests>
    <test>
      <!-- Three pileup inputs, hinge on column 2, empty cells filled with the single value "?" -->
      <param name="input1" value="column_join_in1.pileup" ftype="pileup" />
      <param name="hinge" value="2" />
      <param name="columns" value="1,2,3,4,5,7" />
      <param name="fill_empty_columns_switch" value="fill_empty" />
      <param name="column_fill_type" value="single_fill_value" />
      <param name="fill_value" value="?" />
      <param name="input2" value="column_join_in2.pileup" ftype="pileup" />
      <param name="input" value="column_join_in3.pileup" ftype="pileup" />
      <output name="output" file="column_join_out1.pileup" ftype="tabular" />
    </test>
    <test>
      <!-- Three pileup inputs, hinge on column 2, columns 1-4 kept, no filling -->
      <param name="input1" value="column_join_in4.pileup" ftype="pileup" />
      <param name="hinge" value="2" />
      <param name="columns" value="1,2,3,4" />
      <param name="fill_empty_columns_switch" value="no_fill" />
      <param name="input2" value="column_join_in5.pileup" ftype="pileup" />
      <param name="input" value="column_join_in6.pileup" ftype="pileup" />
      <output name="output" file="column_join_out2.pileup" ftype="tabular" />
    </test>
<!--  This test is failing for an unclear reason (the column values do not get
      passed into the script), but passes in the browser
    <test>
      <param name="input1" value="column_join_in7.pileup" ftype="tabular" />
      <param name="hinge" value="2" />
      <param name="columns" value="3,4,5" />
      <param name="fill_empty_columns_switch" value="fill_empty" />
      <param name="column_fill_type" value="fill_value_by_column" />
      <param name="column_number" value="5" />
      <param name="fill_value" value="X" />
      <param name="input2" value="column_join_in8.pileup" ftype="tabular" />
      <param name="input" value="column_join_in9.pileup" ftype="tabular" />
      <output name="output" file="column_join_out3.pileup" ftype="tabular" />
    </test>
-->
    <test>
      <!-- Hinge on column 1 only, columns 2 and 7 kept, no filling -->
      <param name="input1" value="column_join_in10.pileup" ftype="pileup" />
      <param name="hinge" value="1" />
      <param name="columns" value="2,7" />
      <param name="fill_empty_columns_switch" value="no_fill" />
      <param name="input2" value="column_join_in11.pileup" ftype="pileup" />
      <param name="input" value="column_join_in12.pileup" ftype="pileup" />
      <output name="output" file="column_join_out4.pileup" ftype="tabular" />
    </test>
    <test>
      <!-- Test for handling missing column -->
      <param name="input1" value="column_join_in13.tabular" ftype="tabular" />
      <param name="hinge" value="1" />
      <param name="columns" value="5" />
      <param name="fill_empty_columns_switch" value="fill_empty" />
      <param name="column_fill_type" value="single_fill_value" />
      <param name="fill_value" value="0" />
      <param name="input2" value="column_join_in14.tabular" ftype="tabular" />
      <param name="input" value="column_join_in15.tabular" ftype="tabular" />
      <output name="output" file="column_join_out5.tabular" ftype="tabular" />
    </test>
  </tests>
  <help>
**What it does**

This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too.

Note that the files are expected to have the same number of columns. If for some reason the join column is missing (this only applies to the last column(s)), the tool attempts to handle this situation by inserting an empty item (or the appropriate filler) for that column on that row. This could lead to the situation where a row has a hinge but entirely empty or filled columns, if the hinge exists in at least one file but every file that has it is missing the join column. Also, note that the tool does not distinguish between a file missing the hinge altogether and a file having the hinge but missing the column (in both cases the column would be empty or filled). There is an example of this below.

-----

**General Example**

Given the following files::

  FILE 1
  chr2    1    T    6    .C...,     I$$III
  chr2    2    G    6    ..N..,     III@II
  chr2    3    C    7    ..C...,    I$IIIII
  chr2    4    G    7    .G....,    I#IIIII
  chr2    5    G    7    ...N..,    IIII#BI
  chr2    6    A    7    ..T...,    I$IDIII
  chr1    1    C    1    ^:.        I
  chr1    2    G    2    .^:.       $I
  chr1    3    A    2    ..         I%
  chr1    4    C    2    ..         I$
  chr1    5    T    3    ..^:.      I#I
  chr1    6    G    3    ..^:,      I#I

  FILE 2
  chr1    3    T    1    ^:.        I
  chr1    4    G    2    .^:.       $I
  chr1    5    T    2    ..         I%
  chr1    6    C    3    ..^:.      III
  chr1    7    G    3    ..^:.      I#I
  chr1    8    T    4    ...^:,     I#II
  chr2    77   C    6    .G...,     I$$III
  chr2    78   G    6    ..N..,     III@II
  chr2    79   T    7    ..N...,    I$IIIII
  chr2    80   C    7    .G....,    I#IIIII
  chr2    81   G    7    ...A..,    IIII#BI
  chr2    82   A    8    ...G...,   I$IDIIII
  chr2    83   T    8    .A.....N   IIIIIIII
  chr2    84   A    9    ......T.   I$IIIIIII

  FILE 3
  chr1    1    A    1    .          I
  chr1    2    T    2    G.         I$
  chr1    3    C    2    .,         I@
  chr1    4    C    3    ..N        III
  chr1    42   C    5    ...N^:.    III@I
  chr1    43   C    5    .N..^:.    IIIII
  chr1    44   T    5    .A..,      IA@II
  chr1    45   A    6    .N...^:.   IIIII$
  chr1    46   G    6    .GN..^:.   I@IIII
  chr1    47   A    7    ....^:..,  IIIII$I
  chr2    73   T    5    .N..,      II$II
  chr2    74   A    5    ....,      IIIII
  chr2    75   T    5    ....,      IIIII
  chr2    76   T    5    ....,      IIIII
  chr2    77   C    5    ....,      IIIBI
  chr2    78   T    5    ....,      IDIII

To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output::

  chr1    1    C    1              A    1
  chr1    2    G    2              T    2
  chr1    3    A    2    T    1    C    2
  chr1    4    C    2    G    2    C    3
  chr1    5    T    3    T    2
  chr1    6    G    3    C    3
  chr1    7              G    3
  chr1    8              T    4
  chr1    42                       C    5
  chr1    43                       C    5
  chr1    44                       T    5
  chr1    45                       A    6
  chr1    46                       G    6
  chr1    47                       A    7
  chr2    1    T    6
  chr2    2    G    6
  chr2    3    C    7
  chr2    4    G    7
  chr2    5    G    7
  chr2    6    A    7
  chr2    73                       T    5
  chr2    74                       A    5
  chr2    75                       T    5
  chr2    76                       T    5
  chr2    77             C    6    C    5
  chr2    78             G    6    T    5
  chr2    79             T    7
  chr2    80             C    7
  chr2    81             G    7
  chr2    82             A    8
  chr2    83             T    8
  chr2    84             A    9

**Example with missing columns**

Given the following input files::

  FILE 1
  1   A
  2   B   b
  4   C   c
  5   D
  6   E   e

  FILE 2
  1   M   m
  2   N
  3   O   o
  4   P   p
  5   Q
  7   R   r

If we join only column 3, using column 1 as the hinge and a fill value of '0', this is what will be output::

  1   0   m
  2   b   0
  3   0   o
  4   c   p
  5   0   0
  6   e   0
  7   0   r

Row 5 appears in both files but is missing the join column in each of them, so its output row contains nothing but fill values.

  </help>
</tool>