Mercurial > repos > bgruening > join_files_on_column_fuzzy

<tool id="join_files_on_column_fuzzy" name="Join two files" version="1.0.1">
    <!-- Tagline that continues the tool name, i.e. "Join two files on column allowing a small difference". -->
    <description>on column allowing a small difference</description>
    <!-- Single runtime dependency: the python package, pinned to version 3.6. -->
    <requirements>
        <requirement version="3.6" type="package">python</requirement>
    </requirements>
    <!--
        Command line for the bundled join_files_on_column_fuzzy.py script.
        File paths are single-quoted. $header and $add_distance expand either
        to their flag or to an empty string (see the boolean params in the
        inputs section). The closest flag is only emitted when the merge mode
        is "closest"; otherwise the script runs without it.
    -->
    <command>
    <![CDATA[
python '$__tool_directory__/join_files_on_column_fuzzy.py'
--f1 '$f1'
--f2 '$f2'
--c1 $c1
--c2 $c2
--outfile '$merged_file'
$header
$add_distance
#if $merge_mode_select == 'closest':
    --closest
#end if
--distance $distance
--unit $units
    ]]>
    </command>
    <inputs>
	<param argument="--f1" type="data" optional="true" format="tabular" label="1st file"
            help=""/>
        <param argument="--c1" type="data_column" data_ref="f1" label="Column to use from 1st file" help="The file needs to be sorted by this column, ascending."/>
        <param argument="--f2" type="data" optional="true" format="tabular" label="2nd file"
            help=""/>
        <param argument="--c2" type="data_column" data_ref="f2" label="Column to use from 2nd file" help="The file needs to be sorted by this column, ascending."/>

        <param argument="--header" type="boolean" checked="false" truevalue="--header" falsevalue="" label="Does the input files contain a header line" />
        <param argument="--add_distance" type="boolean" checked="false" truevalue="--add_distance" falsevalue="" label="Add an addional column with the calculated distance." />

        <param name="merge_mode_select" type="select" label="Choose the mode of merging.">
            <option value="closest" selected="True">Best match (in case of multiple best matches, only the first one is reported)</option>
            <option value="distance">All matches within the defined distance</option>
        </param>
        <param name="units" display="radio" type="select" value="ppm_value" label="Choose the metrics of your distance"
            help="ppm is useful for very small differences">
            <option value="absolute" selected="True">Absolute distance</option>
            <option value="ppm" >Distance in ppm</option>
        </param>
        <param name="distance" value="0.2" type="float" label="Allowed distance between the two values that will trigger a merge" help=""/>
    </inputs>
    <!-- The joined table; the command template hands its path to the script as the outfile argument. -->
    <outputs>
        <data format="tabular" name="merged_file"/>
    </outputs>

    <tests>
        <!-- 1: all matches within an absolute distance of 0.1, inputs without header. -->
        <test>
            <param name="f1" value="file1.tab" ftype="tabular"/>
            <param name="f2" value="file2.tab" ftype="tabular"/>
            <param name="c1" value="1"/>
            <param name="c2" value="1"/>
            <param name="merge_mode_select" value="distance"/>
            <param name="distance" value="0.1"/>
            <param name="units" value="absolute"/>
            <output name="merged_file" file="no_header_result1.tab" />
        </test>
        <!-- 2: all matches within an absolute distance of 0.2, inputs with header. -->
        <test>
            <param name="f1" value="file1_header.tab" ftype="tabular"/>
            <param name="f2" value="file2_header.tab" ftype="tabular"/>
            <param name="c1" value="1"/>
            <param name="c2" value="1"/>
            <param name="merge_mode_select" value="distance"/>
            <param name="distance" value="0.2"/>
            <param name="units" value="absolute"/>
            <param name="header" value="true"/>
            <output name="merged_file" file="header_result2.tab" />
        </test>
        <!-- 3: best match only, inputs with header; distance and units are left at their defaults. -->
        <test>
            <param name="f1" value="file1_header.tab" ftype="tabular"/>
            <param name="f2" value="file2_header.tab" ftype="tabular"/>
            <param name="c1" value="1"/>
            <param name="c2" value="1"/>
            <param name="header" value="true"/>
            <param name="merge_mode_select" value="closest"/>
            <output name="merged_file" file="header_closest_result3.tab" />
        </test>
        <!-- 4: all matches within 100 ppm, inputs without header. -->
        <test>
            <param name="f1" value="file1_ppm.tab" ftype="tabular"/>
            <param name="f2" value="file2_ppm.tab" ftype="tabular"/>
            <param name="c1" value="1"/>
            <param name="c2" value="1"/>
            <param name="header" value="false"/>
            <param name="merge_mode_select" value="distance"/>
            <param name="distance" value="100"/>
            <param name="units" value="ppm"/>
            <output name="merged_file" file="no_header_ppm_result4.tab" />
        </test>
        <!-- 5: best match only, with the extra distance column. The mode is set through
             merge_mode_select, the only parameter that controls it; the tool declares no
             parameter named "closest". -->
        <test>
            <param name="f1" value="file1_header.tab" ftype="tabular"/>
            <param name="f2" value="file2_header.tab" ftype="tabular"/>
            <param name="c1" value="1"/>
            <param name="c2" value="1"/>
            <param name="header" value="true"/>
            <param name="merge_mode_select" value="closest"/>
            <param name="add_distance" value="true"/>
            <output name="merged_file" file="header_closest_result5.tab" />
        </test>
        <!-- 6: all matches within 100 ppm, with the extra distance column. -->
        <test>
            <param name="f1" value="file1_ppm.tab" ftype="tabular"/>
            <param name="f2" value="file2_ppm.tab" ftype="tabular"/>
            <param name="c1" value="1"/>
            <param name="c2" value="1"/>
            <param name="header" value="false"/>
            <param name="merge_mode_select" value="distance"/>
            <param name="distance" value="100"/>
            <param name="units" value="ppm"/>
            <param name="add_distance" value="true"/>
            <output name="merged_file" file="no_header_ppm_result6.tab" />
        </test>
    </tests>
    <help>
<![CDATA[

Join two files on a common column. It is necessary to provide an allowed difference between both values as the maximum absolute difference or maximum parts per million (ppm) for matching.

Two modes are available:

1) In the **best match** mode: For each value in file 1 only the best matching value of file 2 is reported. In case of multiple equally good matches, only the first one is reported.
2) In the **all matches** mode: All matches within the defined distance are reported.

Be aware that file 1 is the template file, and therefore the same value in file 2 can be matched to multiple values in file 1.


------

**Example**

**Input file 1** ::

    1
    2
    3
    4
    5


**Input file 2** ::

    1.1
    1.2
    2.2
    3.3
    4.4


**Joined file1 and 2** with best match, absolute distance 0.3 and the additional distance column enabled::

    1   1.1   0.1
    2   2.2   0.2
    3   3.3   0.3

**Joined file1 and 2** with all matches, absolute distance 0.3 and the additional distance column enabled::

    1   1.1   0.1
    1   1.2   0.2
    2   2.2   0.2
    3   3.3   0.3


]]>
    </help>
    <!-- No citations are declared for this tool. -->
    <citations/>
</tool>
author	bgruening
date	Tue, 29 May 2018 15:34:31 -0400
parents	f2068690addc
children