changeset 0:64469e7ecf9f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit fd2bc86108994c9eda731b305ca6a8c71554cfaa
author bgruening
date Sun, 26 Nov 2017 16:13:51 -0500
parents
children 8750c3125ec5
files join_files_on_column_fuzzy.py join_files_on_column_fuzzy.xml test-data/file1.tab test-data/file1_header.tab test-data/file1_ppm.tab test-data/file2.tab test-data/file2_header.tab test-data/file2_ppm.tab test-data/header_closest_result3.tab test-data/header_closest_result5.tab test-data/header_result2.tab test-data/no_header_ppm_result4.tab test-data/no_header_ppm_result6.tab test-data/no_header_result1.tab
diffstat 14 files changed, 457 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/join_files_on_column_fuzzy.py	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+
+import os
+import argparse
+import sys
+
+def main(args):
+    """Fuzzy-join two delimited files on a numeric column.
+
+    Rows of args.f1 and args.f2 are joined when the values in columns args.c1
+    and args.c2 (1-based) differ by at most args.distance, measured in
+    args.unit: 'absolute', or 'ppm' relative to the file 1 value.  Both files
+    must be sorted ascending on their join column.  With args.closest only the
+    nearest match per file 1 row is written; args.add_distance appends the
+    distance as an extra column.  Output goes to args.outfile, tab-joined.
+    """
+    from collections import deque
+
+    def _readline(header=False):
+        # Yield the header line as a string (if any), then (columns, key) pairs.
+        with open(args.f2) as handle2:
+            for line in handle2:
+                line = line.strip()
+                if header:
+                    header = False
+                    yield line
+                    continue
+                if not line:
+                    continue
+                columns = line.split(args.sep)
+                yield columns, float(columns[args.c2 - 1])
+
+    def fill_cache():
+        # Read one more file 2 row into the window; no-op once file 2 is done.
+        try:
+            cache.append(next(it))
+        except StopIteration:
+            pass
+
+    cache = deque()  # file 2 rows already read that may still match
+    it = _readline(header=args.header)
+    is_header = args.header
+    # 'with' closes the output even when a row fails to parse.
+    with open(args.outfile, 'w') as out, open(args.f1) as handle1:
+        for line in handle1:
+            line = line.strip()
+            if is_header:
+                is_header = False
+                # next(it, '') tolerates an empty file 2 instead of crashing.
+                header_cols = [line, next(it, '')]
+                if args.add_distance:
+                    header_cols.append(args.unit)
+                out.write('\t'.join(header_cols) + '\n')
+                continue
+            if not line:
+                continue
+            columns = line.split(args.sep)
+            value1 = float(columns[args.c1 - 1])
+            lower_bound = value1 - args.distance
+            upper_bound = value1 + args.distance
+            matches = []  # (distance, output line) pairs for this row
+            carried = deque()  # file 2 rows to re-check for the next row
+            fill_cache()
+            while cache:
+                _c, value2 = cache.popleft()
+                if args.unit == 'ppm':
+                    if value1:
+                        dist = abs((value1 - value2) / value1 * 1000000)
+                    else:
+                        # ppm relative to 0 is undefined: only exact 0 matches.
+                        dist = 0.0 if value2 == value1 else float('inf')
+                    is_match = dist <= args.distance
+                    is_above = value2 > value1
+                else:
+                    dist = abs(value1 - value2)
+                    # Test the bounds, not dist: abs(1 - 1.1) > 0.1 in floats,
+                    # yet 1.1 <= 1 + 0.1 holds and the pair must match.
+                    is_match = lower_bound <= value2 <= upper_bound
+                    is_above = value2 > upper_bound
+                if is_match:
+                    fields = columns + _c
+                    if args.add_distance:
+                        fields.append(str(dist))
+                    matches.append((dist, '\t'.join(fields) + '\n'))
+                    carried.append((_c, value2))
+                    fill_cache()
+                elif is_above:
+                    # Too large for this row; later (larger) rows may match.
+                    carried.append((_c, value2))
+                else:
+                    # Too small for this and all later rows: drop, read on.
+                    fill_cache()
+            if args.closest and matches:
+                # min() returns the first of equally close matches.
+                out.write(min(matches, key=lambda m: m[0])[1])
+            else:
+                out.writelines(text for _dist, text in matches)
+            cache = carried
+
+
+if __name__ == '__main__':
+    # Script entry point: declare the command-line options, then run the join.
+    cli = argparse.ArgumentParser(description='Merge two files on a common column the fuzzy way.')
+    cli.add_argument('--f1', required=True)
+    cli.add_argument('--f2', required=True)
+    cli.add_argument('--c1', required=True, type=int, help="Column in file 1 to be merged on.")
+    cli.add_argument('--c2', required=True, type=int, help="Column in file 2 to be merged on.")
+    cli.add_argument('--outfile', required=True)
+    cli.add_argument('--header', help="The files have a header line at the beginning.", action='store_true')
+    cli.add_argument('--closest', help="Only report the closest match.", action='store_true')
+    cli.add_argument('--add_distance', help="Add addional column with the distance between the two values.", action='store_true')
+    cli.add_argument('--sep', default="\t", type=str, help="Files are separated by this separator.")
+    cli.add_argument('--distance', default="0.2", type=float, help="Maximal allowed distance.")
+    cli.add_argument('--unit', default='absolute', choices=['ppm', 'absolute'])
+    # The parsed namespace carries every option main() reads.
+    args = cli.parse_args()
+    main(args)
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/join_files_on_column_fuzzy.xml	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,141 @@
+<tool id="join_files_on_column_fuzzy" name="Join two files" version="1.0.0">
+    <description>
+        on column allowing a small difference
+    </description>
+    <requirements>
+        <requirement type="package" version="3.6">python</requirement> 
+    </requirements>
+    <command>
+    <![CDATA[
+python '$__tool_directory__/join_files_on_column_fuzzy.py'
+--f1 '$f1'
+--f2 '$f2'
+--c1 $c1
+--c2 $c2
+--outfile '$merged_file'
+$header
+$add_distance
+#if $merge_mode.merge_mode_select == 'closest':
+    --closest
+#else:
+    --distance $merge_mode.distance
+    --unit $merge_mode.units
+#end if
+    ]]>
+    </command>
+    <inputs>
+        <param argument="--f1" type="data" optional="true" format="tabular" label="1st file"
+            help=""/>
+        <param argument="--c1" type="data_column" data_ref="f1" label="Column to use from 1st file" help="The file needs to be sorted by this column, ascending."/>
+        <param argument="--f2" type="data" optional="true" format="tabular" label="2nd file"
+            help=""/>
+        <param argument="--c2" type="data_column" data_ref="f2" label="Column to use from 2nd file" help="The file needs to be sorted by this column, ascending."/>
+
+        <param argument="--header" type="boolean" checked="false" truevalue="--header" falsevalue="" label="Do the input files contain a header line?" />
+        <param argument="--add_distance" type="boolean" checked="false" truevalue="--add_distance" falsevalue="" label="Add an additional column with the calculated distance." />
+
+        <conditional name="merge_mode">
+            <param name="merge_mode_select" type="select" label="Choose the mode of merging.">
+                <option value="closest" selected="True">Best match (in case of multiple best matches, only the first one is reported)</option>
+                <option value="distance">Matching with a defined distance</option>
+            </param>
+            <when value="closest"/>
+            <when value="distance">
+                <param name="units" display="radio" type="select" value="ppm_value" label="Choose the metrics of your distance"
+                    help="ppm is useful for very small differences">
+                        <option value="absolute" selected="True">Absolute distance</option>
+                        <option value="ppm" >Distance in ppm</option>
+                </param>
+                <param name="distance" value="0.2" type="float" label="Allowed distance between the two values that will trigger a merge" help=""/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="merged_file" format="tabular" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="f1" value="file1.tab" ftype="tabular"/>
+            <param name="f2" value="file2.tab" ftype="tabular"/>
+            <param name="c1" value="1"/>
+            <param name="c2" value="1"/>
+            <param name="merge_mode_select" value="distance"/>
+            <param name="distance" value="0.1"/>
+            <param name="units" value="absolute"/>
+            <output name="merged_file" file="no_header_result1.tab" />
+        </test>
+        <test>
+            <param name="f1" value="file1_header.tab" ftype="tabular"/>
+            <param name="f2" value="file2_header.tab" ftype="tabular"/>
+            <param name="c1" value="1"/>
+            <param name="c2" value="1"/>
+            <param name="merge_mode_select" value="distance"/>
+            <param name="distance" value="0.2"/>
+            <param name="units" value="absolute"/>
+            <param name="header" value="true"/>
+            <output name="merged_file" file="header_result2.tab" />
+        </test>
+        <test>
+            <param name="f1" value="file1_header.tab" ftype="tabular"/>
+            <param name="f2" value="file2_header.tab" ftype="tabular"/>
+            <param name="c1" value="1"/>
+            <param name="c2" value="1"/>
+            <param name="header" value="true"/>
+            <param name="closest" value="true"/>
+            <output name="merged_file" file="header_closest_result3.tab" />
+        </test>
+        <test>
+            <param name="f1" value="file1_ppm.tab" ftype="tabular"/>
+            <param name="f2" value="file2_ppm.tab" ftype="tabular"/>
+            <param name="c1" value="1"/>
+            <param name="c2" value="1"/>
+            <param name="header" value="false"/>
+            <param name="merge_mode_select" value="distance"/>
+            <param name="distance" value="100"/>
+            <param name="units" value="ppm"/>
+            <output name="merged_file" file="no_header_ppm_result4.tab" />
+        </test>
+        <test>
+            <param name="f1" value="file1_header.tab" ftype="tabular"/>
+            <param name="f2" value="file2_header.tab" ftype="tabular"/>
+            <param name="c1" value="1"/>
+            <param name="c2" value="1"/>
+            <param name="header" value="true"/>
+            <param name="closest" value="true"/>
+            <param name="add_distance" value="true"/>
+            <output name="merged_file" file="header_closest_result5.tab" />
+        </test>
+        <test>
+            <param name="f1" value="file1_ppm.tab" ftype="tabular"/>
+            <param name="f2" value="file2_ppm.tab" ftype="tabular"/>
+            <param name="c1" value="1"/>
+            <param name="c2" value="1"/>
+            <param name="header" value="false"/>
+            <param name="merge_mode_select" value="distance"/>
+            <param name="distance" value="100"/>
+            <param name="units" value="ppm"/>
+            <param name="add_distance" value="true"/>
+            <output name="merged_file" file="no_header_ppm_result6.tab" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+
+Join two files on a common column. An allowed difference between the two values (currently only numbers are supported) can be given
+as an absolute difference or in PPM (parts per million).
+
+Two modes are available: 
+
+1. In the **best match** mode, only the rows with the most similar (or identical) values are merged; candidates must lie within the default distance of 0.2 (absolute). In case of multiple best matches, only the first one is reported.
+
+2. The **Matching with a defined distance** mode lets you
+   provide a maximum distance between the two column values. If the calculated distance is smaller than or equal to the given distance, the rows are joined. You can specify the allowed distance as an absolute distance or in PPM.
+
+
+]]>
+    </help>
+    <citations>
+    </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/file1.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,10 @@
+1	one
+2	two
+3	three
+4	four
+5	five
+6	six
+7	seven
+8	eight
+9	nine 
+10	ten
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/file1_header.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,11 @@
+#number1	desc1
+1	one
+2	two
+3	three
+4	four
+5	five
+6	six
+7	seven
+8	eight
+9	nine 
+10	ten
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/file1_ppm.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,3 @@
+1221.12	first entry
+1973.54	second entry
+2233.44	third entry
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/file2.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,43 @@
+1.1	should be true
+1.1	should be true
+1.1	should be true
+1.2	should be false
+1.3	should be false
+1.4	should be false
+1.5	should be false
+1.6	should be false
+1.7	should be false
+1.8	should be false
+1.9	should be false
+2	should be true
+2.1	should be false
+2.2	should be false
+2.3	should be false
+2.4	should be false
+2.5	should be false
+2.6	should be false
+2.7	should be false
+2.8	should be false
+2.9	should be false
+3	should be true
+3.1	should be false
+3.2	should be false
+3.3	should be false
+3.4	should be false
+3.5	should be false
+3.6	should be false
+3.7	should be false
+3.8	should be false
+3.9	should be false
+4	should be true
+4.1	should be false
+4.2	should be false
+4.3	should be false
+4.4	should be false
+4.5	should be false
+4.6	should be false
+4.7	should be false
+4.8	should be false
+4.9	should be false
+5.1	should be true
+10.1	should be true
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/file2_header.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,44 @@
+#number2	desc2
+1.1	should be true
+1.1	should be true
+1.1	should be true
+1.2	should be false
+1.3	should be false
+1.4	should be false
+1.5	should be false
+1.6	should be false
+1.7	should be false
+1.8	should be false
+1.9	should be false
+2	should be true
+2.1	should be false
+2.2	should be false
+2.3	should be false
+2.4	should be false
+2.5	should be false
+2.6	should be false
+2.7	should be false
+2.8	should be false
+2.9	should be false
+3	should be true
+3.1	should be false
+3.2	should be false
+3.3	should be false
+3.4	should be false
+3.5	should be false
+3.6	should be false
+3.7	should be false
+3.8	should be false
+3.9	should be false
+4	should be true
+4.1	should be false
+4.2	should be false
+4.3	should be false
+4.4	should be false
+4.5	should be false
+4.6	should be false
+4.7	should be false
+4.8	should be false
+4.9	should be false
+5.1	should be true
+10.1	should be true
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/file2_ppm.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,11 @@
+1221.13	match1
+1221.11	match2
+1221.15	match3
+1221	match4
+1973.5	match5
+1973.52	match6
+1973.57	match7
+1973.48	match8
+2233.4	match9
+2233.3	match10
+2233.5	match11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/header_closest_result3.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,7 @@
+#number1	desc1	#number2	desc2
+1	one	1.1	should be true
+2	two	2	should be true
+3	three	3	should be true
+4	four	4	should be true
+5	five	4.9	should be false
+10	ten	10.1	should be true
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/header_closest_result5.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,7 @@
+#number1	desc1	#number2	desc2	absolute
+1	one	1.1	should be true	0.10000000000000009
+2	two	2	should be true	0.0
+3	three	3	should be true	0.0
+4	four	4	should be true	0.0
+5	five	4.9	should be false	0.09999999999999964
+10	ten	10.1	should be true	0.09999999999999964
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/header_result2.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,24 @@
+#number1	desc1	#number2	desc2
+1	one	1.1	should be true
+1	one	1.1	should be true
+1	one	1.1	should be true
+1	one	1.2	should be false
+2	two	1.8	should be false
+2	two	1.9	should be false
+2	two	2	should be true
+2	two	2.1	should be false
+2	two	2.2	should be false
+3	three	2.8	should be false
+3	three	2.9	should be false
+3	three	3	should be true
+3	three	3.1	should be false
+3	three	3.2	should be false
+4	four	3.8	should be false
+4	four	3.9	should be false
+4	four	4	should be true
+4	four	4.1	should be false
+4	four	4.2	should be false
+5	five	4.8	should be false
+5	five	4.9	should be false
+5	five	5.1	should be true
+10	ten	10.1	should be true
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/no_header_ppm_result4.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,11 @@
+1221.12	first entry	1221.13	match1
+1221.12	first entry	1221.11	match2
+1221.12	first entry	1221.15	match3
+1221.12	first entry	1221	match4
+1973.54	second entry	1973.5	match5
+1973.54	second entry	1973.52	match6
+1973.54	second entry	1973.57	match7
+1973.54	second entry	1973.48	match8
+2233.44	third entry	2233.4	match9
+2233.44	third entry	2233.3	match10
+2233.44	third entry	2233.5	match11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/no_header_ppm_result6.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,11 @@
+1221.12	first entry	1221.13	match1	8.189203354476447
+1221.12	first entry	1221.11	match2	8.189203354290248
+1221.12	first entry	1221.15	match3	24.56761006305694
+1221.12	first entry	1221	match4	98.27044025148295
+1973.54	second entry	1973.5	match5	20.268147592632335
+1973.54	second entry	1973.52	match6	10.134073796316168
+1973.54	second entry	1973.57	match7	15.201110694474252
+1973.54	second entry	1973.48	match8	30.402221388948504
+2233.44	third entry	2233.4	match9	17.909592377661195
+2233.44	third entry	2233.3	match10	62.68357332181419
+2233.44	third entry	2233.5	match11	26.864388566491794
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/no_header_result1.tab	Sun Nov 26 16:13:51 2017 -0500
@@ -0,0 +1,15 @@
+1	one	1.1	should be true
+1	one	1.1	should be true
+1	one	1.1	should be true
+2	two	1.9	should be false
+2	two	2	should be true
+2	two	2.1	should be false
+3	three	2.9	should be false
+3	three	3	should be true
+3	three	3.1	should be false
+4	four	3.9	should be false
+4	four	4	should be true
+4	four	4.1	should be false
+5	five	4.9	should be false
+5	five	5.1	should be true
+10	ten	10.1	should be true