Mercurial > repos > mvdbeek > collection_column_join
changeset 0:4a90bbd2110c draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/collection_column_join commit ac5a5dcefafe63a842e0b04b733cc5ee1177acba-dirty"
author | mvdbeek |
---|---|
date | Mon, 07 Sep 2020 12:50:11 +0000 |
parents | |
children | 06cdbee48b68 |
files | collection_column_join.xml test-data/in_1.tabular test-data/in_1_headerless.tabular test-data/in_2.tabular test-data/in_2_headerless.tabular test-data/in_3.tabular test-data/in_3_headerless.tabular test-data/out_1.tabular test-data/out_2.tabular test-data/out_3.tabular test-data/out_4.tabular tool_test_output_2.json |
diffstat | 12 files changed, 201 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/collection_column_join.xml Mon Sep 07 12:50:11 2020 +0000
@@ -0,0 +1,185 @@
+<tool id="collection_column_join" name="Column Join" version="0.0.3+galaxy1">
+    <!--
+        Joins any number of tabular datasets on one shared identifier column.
+        The command runs a generated shell script (the configfile below) that
+        chains head/tail/sort/join/paste/cat and awk over the inputs.
+        NOTE(review): awk is used by the script but only coreutils is declared
+        as a requirement; presumably awk comes from the job environment. Confirm.
+    -->
+    <description>on Collections</description>
+    <requirements>
+        <requirement type="package" version="8.25">coreutils</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Copy the generated script to its own output only when that output was requested.
+        #if 'output_shell_script' in str( $include_outputs ).split( "," ):
+            cp '${collection_column_join_script}' '${script_output}' &&
+        #end if
+        sh '${collection_column_join_script}'
+        ]]>
+    </command>
+    <configfiles>
+        <configfile name="collection_column_join_script"><![CDATA[
+#!/bin/sh
+touch header0.tmp &&
+touch output0.tmp &&
+## Fields are tab separated. The running header and result alternate between the 0 and 1 numbered tmp files.
+#set $delimiter = '\t'
+## Key position in the running result: the chosen column for the first input, then 1 once a join has happened.
+#set $left_identifier_column = $identifier_column
+## tail -n +K starts printing at line K, so the data starts one line past the header lines.
+#set $tail_offset = int( str( $has_header ) ) + 1
+#for $i, $tabular_item in enumerate( $input_tabular ):
+    ## Step 1: write the header cells of this input to input_header.tmp and its key-sorted rows to input_file.tmp.
+    ## The sort key is limited to the identifier field (-k N,N) because join needs its inputs ordered on exactly that field.
+    #if $old_col_in_header:
+        #if $has_header:
+            head -n ${has_header} "${tabular_item}" | awk '{ n = split(\$0,arr,"${delimiter}"); ctr=1; for(i=1;i<=n;i++){ if( i != $identifier_column ){ if( ctr > 1) {printf("${delimiter}")}; printf( "%s_%s", "${tabular_item.element_identifier}", arr[i] ); ctr++ } }; printf( "\n" ); }' > input_header.tmp &&
+            tail -n +${tail_offset} "${tabular_item}" | LC_ALL=C sort -t "${delimiter}" -k ${identifier_column},${identifier_column} > input_file.tmp &&
+        #else:
+            awk '{ n = split(\$0,arr,"${delimiter}"); ctr=1; for(i=1;i<=n;i++){ if( i != $identifier_column ){ if( ctr > 1) {printf("${delimiter}")}; printf( "%s_%s", "${tabular_item.element_identifier}", i ); ctr++ } }; exit }' "${tabular_item}" > input_header.tmp &&
+            LC_ALL=C sort -t "${delimiter}" -k ${identifier_column},${identifier_column} "${tabular_item}" > input_file.tmp &&
+        #end if
+    #else:
+        #if $has_header:
+            head -n ${has_header} "${tabular_item}" | awk '{ n = split(\$0,arr,"${delimiter}"); ctr=1; for(i=1;i<=n;i++){ if( i != $identifier_column ){ if( ctr > 1) {printf("${delimiter}")}; printf( "%s", "${tabular_item.element_identifier}" ); ctr++ } }; printf( "\n" ); }' > input_header.tmp &&
+            tail -n +${tail_offset} "${tabular_item}" | LC_ALL=C sort -t "${delimiter}" -k ${identifier_column},${identifier_column} > input_file.tmp &&
+        #else:
+            awk '{ n = split(\$0,arr,"${delimiter}"); ctr=1; for(i=1;i<=n;i++){ if( i != $identifier_column ){ if( ctr > 1) {printf("${delimiter}")}; printf( "%s", "${tabular_item.element_identifier}" ); ctr++ } }; exit }' "${tabular_item}" > input_header.tmp &&
+            LC_ALL=C sort -t "${delimiter}" -k ${identifier_column},${identifier_column} "${tabular_item}" > input_file.tmp &&
+        #end if
+    #end if
+    ## Step 2: seed the result with the first input, or join the next input onto the running result.
+    #if $i == 0:
+        mv input_file.tmp output${ ( $i + 1 ) % 2 }.tmp &&
+        #if $has_header:
+            ## The explicit "%s" keeps a header cell that contains a percent sign from being read as a printf format.
+            awk '{ printf "%s", \$${identifier_column}; exit }' "${tabular_item}" > header${ $i % 2 }.tmp &&
+        #else:
+            echo "#KEY" > header${ $i % 2 }.tmp &&
+        #end if
+    #else:
+        LC_ALL=C join -o auto -a 1 -a 2 -1 ${left_identifier_column} -2 ${identifier_column} -t "${delimiter}" -e "${fill_char}" output${ $i % 2 }.tmp input_file.tmp > output${ ( $i + 1 ) % 2 }.tmp &&
+        #set $left_identifier_column = 1
+    #end if
+    ## Step 3: append the header cells of this input to the running header.
+    paste -d "${delimiter}" header${ $i % 2 }.tmp input_header.tmp > header${ ( $i + 1 ) % 2 }.tmp &&
+#end for
+cat header${ ( $i + 1 ) % 2 }.tmp output${ ( $i + 1 ) % 2 }.tmp > "${tabular_output}"
+        ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="input_tabular" type="data" format="tabular" multiple="True" optional="False" label="Tabular files"/>
+        <!-- <param name="identifier_column" type="data_column" data_ref="input_tabular" value="0" min="0" optional="False" label="Identifier column"/> -->
+        <!-- sort, join and awk count columns from 1, so 0 is not a usable value and is rejected here. -->
+        <param name="identifier_column" type="integer" value="1" min="1" optional="False" label="Identifier column" help="The column (counting from 1) that will be used to join the input datasets"/>
+        <param name="has_header" type="integer" value="0" min="0" optional="False" label="Number of header lines in each input file" help="If this is set to 0, a header line will be added containing column names as follows: the identifier column will be named #KEY and the other columns are named by the input dataset names/columns. If you have one or more header lines in your input, set this to the number of header lines."/>
+        <param name="old_col_in_header" type="boolean" checked="true" label="Add column name to header" help="Disable if you want column headers to only be composed of the input file names, for example, if you want headers like file1 and not file1_column1, see Help section below. Default: Yes"/>
+        <param name="fill_char" type="text" value="." optional="False" label="Fill character"/>
+        <param name="include_outputs" type="select" multiple="True" label="Additional datasets to create">
+            <option value="output_shell_script" selected="false">Shell script</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="tabular_output"/>
+        <!-- Only created when "Shell script" is selected under "Additional datasets to create". -->
+        <data format="txt" name="script_output">
+            <filter>include_outputs and "output_shell_script" in include_outputs</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- One header line per input; output headers combine element identifier and column name. -->
+        <test>
+            <param name="input_tabular" value="in_1.tabular,in_2.tabular,in_3.tabular" ftype="tabular"/>
+            <param name="identifier_column" value="1"/>
+            <param name="has_header" value="1"/>
+            <param name="old_col_in_header" value="true"/>
+            <param name="fill_char" value="."/>
+            <param name="include_outputs" />
+            <output name="tabular_output" file="out_1.tabular" ftype="tabular"/>
+        </test>
+        <!-- Headerless inputs; output headers combine element identifier and column number. -->
+        <test>
+            <param name="input_tabular" value="in_1_headerless.tabular,in_2_headerless.tabular,in_3_headerless.tabular" ftype="tabular"/>
+            <param name="identifier_column" value="1"/>
+            <param name="has_header" value="0"/>
+            <param name="old_col_in_header" value="true"/>
+            <param name="fill_char" value="."/>
+            <param name="include_outputs" />
+            <output name="tabular_output" file="out_2.tabular" ftype="tabular"/>
+        </test>
+        <!-- One header line per input; output headers are the element identifier only. -->
+        <test>
+            <param name="input_tabular" value="in_1.tabular,in_2.tabular,in_3.tabular" ftype="tabular"/>
+            <param name="identifier_column" value="1"/>
+            <param name="has_header" value="1"/>
+            <param name="old_col_in_header" value="false"/>
+            <param name="fill_char" value="."/>
+            <param name="include_outputs" />
+            <output name="tabular_output" file="out_3.tabular" ftype="tabular"/>
+        </test>
+        <!-- Headerless inputs; output headers are the element identifier only. -->
+        <test>
+            <param name="input_tabular" value="in_1_headerless.tabular,in_2_headerless.tabular,in_3_headerless.tabular" ftype="tabular"/>
+            <param name="identifier_column" value="1"/>
+            <param name="has_header" value="0"/>
+            <param name="old_col_in_header" value="false"/>
+            <param name="fill_char" value="."/>
+            <param name="include_outputs" />
+            <output name="tabular_output" file="out_4.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help>
+        <![CDATA[
+Joins lists of tabular datasets together on a field.
+
+-----
+
+**Example**
+
+To join three files, with headers, based on the first column:
+
+**First file (in_1)**::
+
+    #KEY    c2      c3      c4
+    one     1-1     1-2     1-3
+    two     1-4     1-5     1-6
+    three   1-7     1-8     1-9
+
+
+**Second file (in_2)**::
+
+    #KEY    c2      c3      c4
+    one     2-1     2-2     2-3
+    two     2-4     2-5     2-6
+    three   2-7     2-8     2-9
+
+**Third file (in_3)**::
+
+    #KEY    c2      c3      c4
+    one     3-3     3-2     3-3
+    two     3-4     3-5     3-6
+    three   3-7     3-8     3-9
+
+
+**Joining** the files, using an **identifier column of 1** and **1 header line**, will return::
+
+    #KEY    in_1_c2 in_1_c3 in_1_c4 in_2_c2 in_2_c3 in_2_c4 in_3_c2 in_3_c3 in_3_c4
+    one     1-1     1-2     1-3     2-1     2-2     2-3     3-3     3-2     3-3
+    three   1-7     1-8     1-9     2-7     2-8     2-9     3-7     3-8     3-9
+    two     1-4     1-5     1-6     2-4     2-5     2-6     3-4     3-5     3-6
+
+
+**Joining** the files, using an **identifier column of 1** and **1 header line**, but disabling **Add column name to header**, will return::
+
+    #KEY    in_1    in_1    in_1    in_2    in_2    in_2    in_3    in_3    in_3
+    one     1-1     1-2     1-3     2-1     2-2     2-3     3-3     3-2     3-3
+    three   1-7     1-8     1-9     2-7     2-8     2-9     3-7     3-8     3-9
+    two     1-4     1-5     1-6     2-4     2-5     2-6     3-4     3-5     3-6
+
+        ]]>
+    </help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in_1.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY c2 c3 c4 +one 1-1 1-2 1-3 +two 1-4 1-5 1-6 +three 1-7 1-8 1-9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in_1_headerless.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,3 @@ +one 1-1 1-2 1-3 +two 1-4 1-5 1-6 +three 1-7 1-8 1-9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in_2.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY c2 c3 c4 +one 2-1 2-2 2-3 +two 2-4 2-5 2-6 +three 2-7 2-8 2-9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in_2_headerless.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,3 @@ +one 2-1 2-2 2-3 +two 2-4 2-5 2-6 +three 2-7 2-8 2-9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in_3.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY c2 c3 c4 +one 3-3 3-2 3-3 +two 3-4 3-5 3-6 +three 3-7 3-8 3-9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in_3_headerless.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,3 @@ +one 3-3 3-2 3-3 +two 3-4 3-5 3-6 +three 3-7 3-8 3-9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_1.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY in_1.tabular_c2 in_1.tabular_c3 in_1.tabular_c4 in_2.tabular_c2 in_2.tabular_c3 in_2.tabular_c4 in_3.tabular_c2 in_3.tabular_c3 in_3.tabular_c4 +one 1-1 1-2 1-3 2-1 2-2 2-3 3-3 3-2 3-3 +three 1-7 1-8 1-9 2-7 2-8 2-9 3-7 3-8 3-9 +two 1-4 1-5 1-6 2-4 2-5 2-6 3-4 3-5 3-6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_2.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY in_1_headerless.tabular_2 in_1_headerless.tabular_3 in_1_headerless.tabular_4 in_2_headerless.tabular_2 in_2_headerless.tabular_3 in_2_headerless.tabular_4 in_3_headerless.tabular_2 in_3_headerless.tabular_3 in_3_headerless.tabular_4 +one 1-1 1-2 1-3 2-1 2-2 2-3 3-3 3-2 3-3 +three 1-7 1-8 1-9 2-7 2-8 2-9 3-7 3-8 3-9 +two 1-4 1-5 1-6 2-4 2-5 2-6 3-4 3-5 3-6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_3.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY in_1.tabular in_1.tabular in_1.tabular in_2.tabular in_2.tabular in_2.tabular in_3.tabular in_3.tabular in_3.tabular +one 1-1 1-2 1-3 2-1 2-2 2-3 3-3 3-2 3-3 +three 1-7 1-8 1-9 2-7 2-8 2-9 3-7 3-8 3-9 +two 1-4 1-5 1-6 2-4 2-5 2-6 3-4 3-5 3-6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_4.tabular Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,4 @@ +#KEY in_1_headerless.tabular in_1_headerless.tabular in_1_headerless.tabular in_2_headerless.tabular in_2_headerless.tabular in_2_headerless.tabular in_3_headerless.tabular in_3_headerless.tabular in_3_headerless.tabular +one 1-1 1-2 1-3 2-1 2-2 2-3 3-3 3-2 3-3 +three 1-7 1-8 1-9 2-7 2-8 2-9 3-7 3-8 3-9 +two 1-4 1-5 1-6 2-4 2-5 2-6 3-4 3-5 3-6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_test_output_2.json Mon Sep 07 12:50:11 2020 +0000 @@ -0,0 +1,1 @@ +{"version": "0.1", "tests": [{"id": "functional.test_toolbox.TestForTool_collection_column_join.test_tool_000000", "has_data": true, "data": {"tool_id": "collection_column_join", "tool_version": "0.0.3", "test_index": 0, "time_seconds": 32.53955006599426, "inputs": {"input_tabular": [{"src": "hda", "id": "2891970512fa2d5a"}, {"src": "hda", "id": "5729865256bc2525"}, {"src": "hda", "id": "54f2a3a23292eb07"}], "identifier_column": "1", "has_header": "1", "old_col_in_header": true, "fill_char": ".", "include_outputs": null}, "job": {"command_line": "sh '/tmp/tmpv5rtmokb/job_working_directory/000/4/tmpgezz2sse'", "create_time": "2019-11-20T13:29:35.595219", "exit_code": 0, "external_id": "205", "galaxy_version": "19.09", "history_id": "2891970512fa2d5a", "id": "8155e4b4bf1581ff", "inputs": {"input_tabular": {"id": "2891970512fa2d5a", "src": "hda", "uuid": "7b5f3b0a-2fda-4bce-b8e6-544cd96901ff"}, "input_tabular1": {"id": "2891970512fa2d5a", "src": "hda", "uuid": "7b5f3b0a-2fda-4bce-b8e6-544cd96901ff"}, "input_tabular2": {"id": "5729865256bc2525", "src": "hda", "uuid": "97998a2d-fae2-43ac-bde7-f23c071e7267"}, "input_tabular3": {"id": "54f2a3a23292eb07", "src": "hda", "uuid": "d91ea98f-8149-4e84-8ca5-52e75ebd25a0"}}, "job_messages": [], "job_metrics": [], "job_stderr": "", "job_stdout": "", "model_class": "Job", "outputs": {"tabular_output": {"id": "8155e4b4bf1581ff", "src": "hda", "uuid": "e3d79ed7-8784-4d41-bffc-7793f8133acb"}}, "params": {"chromInfo": "\"/galaxy/tool-data/shared/ucsc/chrom/?.len\"", "dbkey": "\"?\"", "fill_char": "\".\"", "has_header": "\"1\"", "identifier_column": "\"1\"", "include_outputs": "null", "old_col_in_header": "\"true\""}, "state": "ok", "stderr": "", "stdout": "", "tool_id": "collection_column_join", "tool_stderr": "", "tool_stdout": "", "update_time": "2019-11-20T13:29:38.873935", "user_email": "test@bx.psu.edu"}, 
"status": "success"}}, {"id": "functional.test_toolbox.TestForTool_collection_column_join.test_tool_000001", "has_data": true, "data": {"tool_id": "collection_column_join", "tool_version": "0.0.3", "test_index": 1, "time_seconds": 28.99092698097229, "inputs": {"input_tabular": [{"src": "hda", "id": "7b55dbb89df8f4e5"}, {"src": "hda", "id": "fa6d20d0fb68383f"}, {"src": "hda", "id": "683bc220e21425bb"}], "identifier_column": "1", "has_header": "0", "old_col_in_header": true, "fill_char": ".", "include_outputs": null}, "job": {"command_line": "sh '/tmp/tmpv5rtmokb/job_working_directory/000/8/tmp5f73g0w9'", "create_time": "2019-11-20T13:30:07.921565", "exit_code": 0, "external_id": "347", "galaxy_version": "19.09", "history_id": "5729865256bc2525", "id": "a90a30fafe298e1e", "inputs": {"input_tabular": {"id": "7b55dbb89df8f4e5", "src": "hda", "uuid": "6890a6b0-8fda-408d-99da-3b0bae9293c7"}, "input_tabular1": {"id": "7b55dbb89df8f4e5", "src": "hda", "uuid": "6890a6b0-8fda-408d-99da-3b0bae9293c7"}, "input_tabular2": {"id": "fa6d20d0fb68383f", "src": "hda", "uuid": "07875730-4b60-4ae3-913e-134e51b8ecb7"}, "input_tabular3": {"id": "683bc220e21425bb", "src": "hda", "uuid": "b9bd190a-8400-4d77-b347-88454a70061d"}}, "job_messages": [], "job_metrics": [], "job_stderr": "", "job_stdout": "", "model_class": "Job", "outputs": {"tabular_output": {"id": "a90a30fafe298e1e", "src": "hda", "uuid": "bf86ec20-b584-43f7-919f-9ab4c2d38af8"}}, "params": {"chromInfo": "\"/galaxy/tool-data/shared/ucsc/chrom/?.len\"", "dbkey": "\"?\"", "fill_char": "\".\"", "has_header": "\"0\"", "identifier_column": "\"1\"", "include_outputs": "null", "old_col_in_header": "\"true\""}, "state": "ok", "stderr": "", "stdout": "", "tool_id": "collection_column_join", "tool_stderr": "", "tool_stdout": "", "update_time": "2019-11-20T13:30:11.124008", "user_email": "test@bx.psu.edu"}, "status": "success"}}, {"id": "functional.test_toolbox.TestForTool_collection_column_join.test_tool_000002", "has_data": true, 
"data": {"tool_id": "collection_column_join", "tool_version": "0.0.3", "test_index": 2, "time_seconds": 32.19673824310303, "inputs": {"input_tabular": [{"src": "hda", "id": "b842d972534ccb3e"}, {"src": "hda", "id": "5449172d6ff5669b"}, {"src": "hda", "id": "9ce08b2254e4d5ed"}], "identifier_column": "1", "has_header": "1", "old_col_in_header": false, "fill_char": ".", "include_outputs": null}, "job": {"command_line": "sh '/tmp/tmpv5rtmokb/job_working_directory/000/12/tmp8o9h4tjj'", "create_time": "2019-11-20T13:30:36.896777", "exit_code": 0, "external_id": "482", "galaxy_version": "19.09", "history_id": "54f2a3a23292eb07", "id": "80b8022ff3f677b7", "inputs": {"input_tabular": {"id": "b842d972534ccb3e", "src": "hda", "uuid": "ba0e4c0c-d0bf-4d93-9f6b-c1be07144dd9"}, "input_tabular1": {"id": "b842d972534ccb3e", "src": "hda", "uuid": "ba0e4c0c-d0bf-4d93-9f6b-c1be07144dd9"}, "input_tabular2": {"id": "5449172d6ff5669b", "src": "hda", "uuid": "5f35d996-61cf-4a9c-b940-bd19327aa116"}, "input_tabular3": {"id": "9ce08b2254e4d5ed", "src": "hda", "uuid": "e9a2df33-bb1e-4181-8d8e-f8dae2f2cfe1"}}, "job_messages": [], "job_metrics": [], "job_stderr": "", "job_stdout": "", "model_class": "Job", "outputs": {"tabular_output": {"id": "80b8022ff3f677b7", "src": "hda", "uuid": "975835d7-0306-439c-93fc-c34a737444bd"}}, "params": {"chromInfo": "\"/galaxy/tool-data/shared/ucsc/chrom/?.len\"", "dbkey": "\"?\"", "fill_char": "\".\"", "has_header": "\"1\"", "identifier_column": "\"1\"", "include_outputs": "null", "old_col_in_header": "\"false\""}, "state": "ok", "stderr": "", "stdout": "", "tool_id": "collection_column_join", "tool_stderr": "", "tool_stdout": "", "update_time": "2019-11-20T13:30:40.219905", "user_email": "test@bx.psu.edu"}, "status": "success"}}, {"id": "functional.test_toolbox.TestForTool_collection_column_join.test_tool_000003", "has_data": true, "data": {"tool_id": "collection_column_join", "tool_version": "0.0.3", "test_index": 3, "time_seconds": 28.97074604034424, 
"inputs": {"input_tabular": [{"src": "hda", "id": "b54fb481e575bccc"}, {"src": "hda", "id": "1ae74d26531588b0"}, {"src": "hda", "id": "440a6c2b5d9efe20"}], "identifier_column": "1", "has_header": "0", "old_col_in_header": false, "fill_char": ".", "include_outputs": null}, "job": {"command_line": "sh '/tmp/tmpv5rtmokb/job_working_directory/000/16/tmpygwwxko5'", "create_time": "2019-11-20T13:31:09.181482", "exit_code": 0, "external_id": "624", "galaxy_version": "19.09", "history_id": "8155e4b4bf1581ff", "id": "ea0b941dfbe636f8", "inputs": {"input_tabular": {"id": "b54fb481e575bccc", "src": "hda", "uuid": "97283a56-ca19-49b9-a008-8acc58dd7090"}, "input_tabular1": {"id": "b54fb481e575bccc", "src": "hda", "uuid": "97283a56-ca19-49b9-a008-8acc58dd7090"}, "input_tabular2": {"id": "1ae74d26531588b0", "src": "hda", "uuid": "28b7c855-37a9-4dc4-bcc4-bd1a795a4801"}, "input_tabular3": {"id": "440a6c2b5d9efe20", "src": "hda", "uuid": "ce55cc4e-eb52-465a-953c-ac49fbbf58e1"}}, "job_messages": [], "job_metrics": [], "job_stderr": "", "job_stdout": "", "model_class": "Job", "outputs": {"tabular_output": {"id": "ea0b941dfbe636f8", "src": "hda", "uuid": "8fe2159d-3e23-4d5c-bb95-21aa5e2daca3"}}, "params": {"chromInfo": "\"/galaxy/tool-data/shared/ucsc/chrom/?.len\"", "dbkey": "\"?\"", "fill_char": "\".\"", "has_header": "\"0\"", "identifier_column": "\"1\"", "include_outputs": "null", "old_col_in_header": "\"false\""}, "state": "ok", "stderr": "", "stdout": "", "tool_id": "collection_column_join", "tool_stderr": "", "tool_stdout": "", "update_time": "2019-11-20T13:31:12.320556", "user_email": "test@bx.psu.edu"}, "status": "success"}}], "summary": {"num_tests": 4, "num_failures": 0, "num_errors": 0, "num_skips": 0}, "exit_code": 0} \ No newline at end of file