changeset 0:d43312f961cc draft default tip

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/split_tabular_columns commit 1d5750b99b90bb1d2730c816a95849e9b9a7d2f9-dirty
author jjohnson
date Wed, 01 Mar 2017 14:01:57 -0500
parents
children
files split_tabular_columns.py split_tabular_columns.xml text-data/input.tabular text-data/output.tabular
diffstat 4 files changed, 170 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_tabular_columns.py	Wed Mar 01 14:01:57 2017 -0500
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                                                 University of Minnesota
+#                 Copyright 2016, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#    James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+"""
+Split selected columns on a delimiter string
+and print a line for each item split
+
+For example:
+split_tabular_columns.py -c 3 -c 4 -s '; '
+with input line:
+1	1.3	id1; id2	desc1; desc2	AMDLID
+will be output as:
+1	1.3	id1	desc1	AMDLID
+1	1.3	id2	desc2	AMDLID
+"""
+
+import sys
+import os.path
+import optparse
+from optparse import OptionParser
+
+
+def __main__():
+    """Split list-valued columns of a tabular file into one row per list item.
+
+    Options: -i/-o file paths (default stdin/stdout), -c 1-based column ordinal
+    (repeatable; ordinals < 1 or beyond the end of a row are ignored), -s delimiter.
+    A split column with fewer items than the longest one is padded with ''.
+    Exits 2/3 when the input/output cannot be opened, 1 on any later error.
+    """
+    parser = optparse.OptionParser()
+    parser.add_option('-i', '--input', dest='input', default=None, help='Tabular input file')
+    parser.add_option('-o', '--output', dest='output', default=None, help='Tabular output file')
+    parser.add_option('-c', '--column', type='int', action='append', dest='column', default=[], help='column ordinal to split')
+    parser.add_option('-s', '--split_on', dest='split_on', default=' ', help='String on which to split columns')
+    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
+    (options, args) = parser.parse_args()
+    try:
+        inputFile = open(os.path.abspath(options.input), 'r') if options.input is not None else sys.stdin
+    except Exception as e:
+        sys.stderr.write("failed: %s\n" % e)
+        sys.exit(2)
+    try:
+        outputFile = open(os.path.abspath(options.output), 'w') if options.output is not None else sys.stdout
+    except Exception as e:
+        sys.stderr.write("failed: %s\n" % e)
+        sys.exit(3)
+    # Drop ordinals < 1: as negative indexes they would address columns from the end of the row
+    split_cols = set(x - 1 for x in options.column if x > 0)
+    split_on = options.split_on
+    try:
+        for line in inputFile:
+            fields = line.rstrip('\r\n').split('\t')
+            split_fields = dict((c, fields[c].split(split_on)) for c in split_cols if c < len(fields))
+            cnt = max([len(v) for v in split_fields.values()] or [0])
+            if cnt == 0:
+                outputFile.write('\t'.join(fields) + '\n')
+            for n in range(cnt):
+                # A split column with fewer than cnt items contributes '' to the extra rows
+                flds = [x if c not in split_fields else (split_fields[c][n] if n < len(split_fields[c]) else '')
+                        for (c, x) in enumerate(fields)]
+                outputFile.write('\t'.join(flds) + '\n')
+    except Exception as e:
+        sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e))
+        sys.exit(1)
+    finally:
+        # Close (and thereby flush) only the files opened here, never stdin/stdout
+        if inputFile is not sys.stdin:
+            inputFile.close()
+        if outputFile is not sys.stdout:
+            outputFile.close()
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split_tabular_columns.xml	Wed Mar 01 14:01:57 2017 -0500
@@ -0,0 +1,71 @@
+<tool id="split_tabular_columns" name="Split Tabular Columns" version="0.0.1">
+    <description>into multiple rows to create a normalized table</description>
+    <requirements>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <!-- The delimiter is passed inside shell single quotes, so each ' in it is rewritten as '"'"' -->
+    <command interpreter="python"><![CDATA[
+        #set $delim = str($split_on).replace("'", "'\"'\"'")
+        split_tabular_columns.py 
+          --input="$input"
+          --split_on='${delim}'
+          #for $col in str($columns).split(","):
+          --column=$col
+          #end for
+          --output="$output"
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Tabular Dataset to normalize"/>
+        <param name="columns" type="data_column" data_ref="input" multiple="True" label="Columns to split" help="One output row is written for each list item in these columns"/>
+        <param name="split_on" type="text" value="," label="List delimiter in column">
+            <sanitizer sanitize="False"/>
+            <validator type="regex" message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="${input.name} normalized"/>
+    </outputs>
+    <tests>
+
+        <test>
+            <param name="input" ftype="tabular" value="input.tabular"/>
+            <param name="columns" value="7,8"/>
+            <param name="split_on" value="; "/>
+            <output name="output" file="output.tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+=====================
+Split Tabular Columns
+=====================
+
+**Inputs**
+
+  Tabular dataset containing one or more columns with a list.  
+
+**Outputs**
+
+  Normalized Tabular dataset with one row for each list item of the split columns.
+
+
+**Example**
+
+  With List delimiter '; ' and split on columns c3 and c4
+
+  the input line:
+
+  ::
+
+    	1	1.3	id1; id2	desc1; desc2	AMDLID
+
+  will be output as 2 lines:
+
+  ::
+
+    	1	1.3	id1	desc1	AMDLID
+    	1	1.3	id2	desc2	AMDLID
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/text-data/input.tabular	Wed Mar 01 14:01:57 2017 -0500
@@ -0,0 +1,4 @@
+597	16.21	16.26	50.9899973869324	35.8900010585785	33.6600005626678	ENST00000263025.8_18	[11 - 1222] ensembl_havana_transcript:known chromosome:GRCh38:16:30114105:30123294:-1 gene:ENSG00000102882.11 gene_biotype:protein_coding transcript_biotype:protein_coding			0.000869458774104714	99.0000009536743	AAAAAQGGGGGEPRR	Acetyl@N-term; Arg-loss@C-term	Acetyl@27	cleaved M-A@N-term; missed R-R@14	-0.000493265979457647	1210.56872558594	606.2916	1210.56909179688	606.291809082031	2	17	3.1.1.46215.1	0.9097167					
+170	38.5	38.52	57.8000009059906	51.0599970817566	49.1100013256073	ENST00000375436.8_24; ENST00000375433.3_22	[63 - 1754] ensembl_havana_transcript:known chromosome:GRCh38:1:17406760:17439724:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding; [3 - 1613] havana:known chromosome:GRCh38:1:17408676:17438561:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAAAAWEEPSSGNGTAR	Cation:K(E)@7; Deamidated(N)@13			-0.00672993017360568	1683.68225097656	562.2347	1683.68908691406	562.236999511719	3	22	6.1.1.74990.1	1.405767					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000242285.10_9; ENST00000540080.5_5; ENST00000538225.5_8; ENST00000470744.5_8; ENST00000433436.6_9; ENST00000345519.9_8	[1 - 864] havana:known chromosome:GRCh38:9:36190937:36212059:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [61 - 699] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [61 - 891] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [3 - 815] havana:known chromosome:GRCh38:9:36190932:36212058:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [61 - 945] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [3 - 788] ensembl_havana_transcript:known chromosome:GRCh38:9:36190923:36212056:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+340	25.71	25.75	48.7399995326996	35.5599999427795	35.5599999427795	ENST00000372289.6_25	[3 - 1664] ensembl_havana_transcript:known chromosome:GRCh38:1:44213491:44220681:1 gene:ENSG00000178028.13 gene_biotype:protein_coding transcript_biotype:protein_coding			2	99.0000009536743	AAEEGKDYPFAR			missed K-D@6	-0.00249932007864118	1352.63342285156	451.8851	1352.63610839844	451.885955810547	3	16	6.1.1.34832.1	0.6571					
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/text-data/output.tabular	Wed Mar 01 14:01:57 2017 -0500
@@ -0,0 +1,10 @@
+597	16.21	16.26	50.9899973869324	35.8900010585785	33.6600005626678	ENST00000263025.8_18	[11 - 1222] ensembl_havana_transcript:known chromosome:GRCh38:16:30114105:30123294:-1 gene:ENSG00000102882.11 gene_biotype:protein_coding transcript_biotype:protein_coding			0.000869458774104714	99.0000009536743	AAAAAQGGGGGEPRR	Acetyl@N-term; Arg-loss@C-term	Acetyl@27	cleaved M-A@N-term; missed R-R@14	-0.000493265979457647	1210.56872558594	606.2916	1210.56909179688	606.291809082031	2	17	3.1.1.46215.1	0.9097167					
+170	38.5	38.52	57.8000009059906	51.0599970817566	49.1100013256073	ENST00000375436.8_24	[63 - 1754] ensembl_havana_transcript:known chromosome:GRCh38:1:17406760:17439724:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAAAAWEEPSSGNGTAR	Cation:K(E)@7; Deamidated(N)@13			-0.00672993017360568	1683.68225097656	562.2347	1683.68908691406	562.236999511719	3	22	6.1.1.74990.1	1.405767					
+170	38.5	38.52	57.8000009059906	51.0599970817566	49.1100013256073	ENST00000375433.3_22	[3 - 1613] havana:known chromosome:GRCh38:1:17408676:17438561:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAAAAWEEPSSGNGTAR	Cation:K(E)@7; Deamidated(N)@13			-0.00672993017360568	1683.68225097656	562.2347	1683.68908691406	562.236999511719	3	22	6.1.1.74990.1	1.405767					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000242285.10_9	[1 - 864] havana:known chromosome:GRCh38:9:36190937:36212059:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000540080.5_5	[61 - 699] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000538225.5_8	[61 - 891] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000470744.5_8	[3 - 815] havana:known chromosome:GRCh38:9:36190932:36212058:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000433436.6_9	[61 - 945] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+2190	2.01	2.01	19.0999999642372	7.29200020432472	7.29200020432472	ENST00000345519.9_8	[3 - 788] ensembl_havana_transcript:known chromosome:GRCh38:9:36190923:36212056:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding			0	99.0000009536743	AAEEAFVNDIDESSPGTEWER			cleaved Q-A@N-term	0.00188760994933546	2351.02099609375	784.6809	2351.01904296875	784.680236816406	3	21	13.1.1.72954.1	1.4438					
+340	25.71	25.75	48.7399995326996	35.5599999427795	35.5599999427795	ENST00000372289.6_25	[3 - 1664] ensembl_havana_transcript:known chromosome:GRCh38:1:44213491:44220681:1 gene:ENSG00000178028.13 gene_biotype:protein_coding transcript_biotype:protein_coding			2	99.0000009536743	AAEEGKDYPFAR			missed K-D@6	-0.00249932007864118	1352.63342285156	451.8851	1352.63610839844	451.885955810547	3	16	6.1.1.34832.1	0.6571