Mercurial > repos > jjohnson > split_tabular_columns
changeset 0:d43312f961cc draft default tip
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/split_tabular_columns commit 1d5750b99b90bb1d2730c816a95849e9b9a7d2f9-dirty
author | jjohnson |
---|---|
date | Wed, 01 Mar 2017 14:01:57 -0500 |
parents | |
children | |
files | split_tabular_columns.py split_tabular_columns.xml text-data/input.tabular text-data/output.tabular |
diffstat | 4 files changed, 170 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/split_tabular_columns.py Wed Mar 01 14:01:57 2017 -0500
# @@ -0,0 +1,85 @@
"""
#
#------------------------------------------------------------------------------
# University of Minnesota
# Copyright 2016, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
# James E Johnson
#
#------------------------------------------------------------------------------
"""

"""
Split selected columns on pattern
and print a line for each item split

For example:
split_tabular_columns.py -c 3 -c 4 -s '; '
with input line:
1 1.3 id1; id2 desc1; desc2 AMDLID
will be output as:
1 1.3 id1 desc1 AMDLID
1 1.3 id2 desc2 AMDLID
"""

import sys
import os.path
import optparse
from optparse import OptionParser


def split_row(fields, split_cols, split_on):
    """Expand one row into one output row per item of its split columns.

    Args:
        fields: list of the row's column values (strings).
        split_cols: iterable of 0-based column indexes whose values are
            lists joined by ``split_on``.  Indexes outside the row
            (negative or >= len(fields)) are ignored.
        split_on: the delimiter string passed to ``str.split``.

    Returns:
        A list of rows (each a new list of strings).  If no requested column
        exists in the row, the result is a single unchanged copy of
        ``fields``.  Otherwise there is one row per item of the longest
        split column; columns that are not split are repeated on every row,
        and a split column that has fewer items than the longest one is
        padded with empty strings.

    Raises:
        ValueError: from ``str.split`` when ``split_on`` is empty and at
            least one requested column exists in the row.
    """
    split_fields = {}
    for c in split_cols:
        if 0 <= c < len(fields):
            split_fields[c] = fields[c].split(split_on)
    if not split_fields:
        return [list(fields)]
    cnt = max(len(items) for items in split_fields.values())
    rows = []
    for n in range(cnt):
        row = list(fields)
        for c, items in split_fields.items():
            # Pad ragged lists instead of failing with an IndexError.
            row[c] = items[n] if n < len(items) else ''
        rows.append(row)
    return rows


def __main__():
    """Command-line entry point: normalize list-valued columns of a tabular file.

    Reads tab-separated lines from ``--input`` (default: stdin), splits the
    columns given by ``--column`` (1-based ordinals, repeatable) on
    ``--split_on`` and writes the expanded rows to ``--output``
    (default: stdout).

    Exit status: 2 if the input file cannot be opened, 3 if the output file
    cannot be opened, 1 if an error occurs while processing.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input', dest='input', default=None, help='Tabular input file')
    parser.add_option('-o', '--output', dest='output', default=None, help='Tabular output file')
    parser.add_option('-c', '--column', type='int', action='append', dest='column', default=[], help='column ordinal to split')
    parser.add_option('-s', '--split_on', dest='split_on', default=' ', help='String on which to split columns')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
    (options, args) = parser.parse_args()
    # Input file
    if options.input is not None:
        try:
            inputPath = os.path.abspath(options.input)
            inputFile = open(inputPath, 'r')
        except Exception as e:
            sys.stderr.write("failed: %s\n" % e)
            sys.exit(2)
    else:
        inputFile = sys.stdin
    # Output file
    if options.output is not None:
        try:
            outputPath = os.path.abspath(options.output)
            outputFile = open(outputPath, 'w')
        except Exception as e:
            sys.stderr.write("failed: %s\n" % e)
            if inputFile is not sys.stdin:
                inputFile.close()
            sys.exit(3)
    else:
        outputFile = sys.stdout
    # Command-line ordinals are 1-based; list indexes are 0-based.
    split_cols = [x - 1 for x in options.column]
    split_on = options.split_on
    try:
        for line in inputFile:
            fields = line.rstrip('\r\n').split('\t')
            for row in split_row(fields, split_cols, split_on):
                outputFile.write('\t'.join(row) + '\n')
    except Exception as e:
        sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e))
        sys.exit(1)
    finally:
        # Only close handles this function opened; leave stdin/stdout alone.
        if inputFile is not sys.stdin:
            inputFile.close()
        if outputFile is not sys.stdout:
            outputFile.close()


if __name__ == "__main__":
    __main__()
<!-- File: split_tabular_columns.xml (new file in this changeset, Wed Mar 01 14:01:57 2017 -0500) -->
<!-- Galaxy tool wrapper that runs split_tabular_columns.py on a tabular dataset. -->
<tool id="split_tabular_columns" name="Split Tabular Columns" version="0.0.1">
    <description>into multiple rows to create a normalized table</description>
    <requirements>
    </requirements>
    <stdio>
        <!-- Any non-zero exit status of the script is reported as a tool error. -->
        <exit_code range="1:" />
    </stdio>
    <command interpreter="python"><![CDATA[
        ## The delimiter is unsanitized user text placed inside a single-quoted
        ## shell word, so every embedded single quote must be rewritten as
        ## '"'"' (close quote, double-quoted quote, reopen quote); otherwise a
        ## delimiter containing ' could terminate the argument and inject
        ## arbitrary shell commands.
        #set $delim = str($split_on).replace("'", "'\"'\"'")
        split_tabular_columns.py
        --input="$input"
        --split_on='${delim}'
        #for $col in str($columns).split(","):
            --column=$col
        #end for
        --output="$output"
    ]]></command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Tabular Dataset to normalize"/>
        <param name="columns" type="data_column" data_ref="input" multiple="True" label="Columns to split"
               help=""/>
        <param name="split_on" type="text" value="," label="List delimiter in column">
            <!-- Sanitizing is disabled so delimiters such as ';' or '|' reach the script verbatim;
                 the command template is responsible for shell quoting. -->
            <sanitizer sanitize="False"/>
            <validator type="regex" message="Anything but TAB or Newline">^[^\t\n\r\f\v]+$</validator>
        </param>
    </inputs>
    <outputs>
        <data format="tabular" name="output" label="${input.name} normalized"/>
    </outputs>
    <tests>

        <test>
            <param name="input" ftype="tabular" value="input.tabular"/>
            <param name="columns" value="7,8"/>
            <param name="split_on" value="; "/>
            <output name="output" file="output.tabular"/>
        </test>
    </tests>
    <help><![CDATA[
=====================
Split Tabular Columns
=====================

**Inputs**

  Tabular dataset containing one or more columns with a list.

**Outputs**

  Normalized Tabular dataset containing one or more columns with a list.


**Example**

  With List delimiter '; ' and split on columns c3 and c4

  the input line:

  ::

    1 1.3 id1; id2 desc1; desc2 AMDLID

  will be output as 2 lines:

  ::

    1 1.3 id1 desc1 AMDLID
    1 1.3 id2 desc2 AMDLID

    ]]></help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/text-data/input.tabular Wed Mar 01 14:01:57 2017 -0500 @@ -0,0 +1,4 @@ +597 16.21 16.26 50.9899973869324 35.8900010585785 33.6600005626678 ENST00000263025.8_18 [11 - 1222] ensembl_havana_transcript:known chromosome:GRCh38:16:30114105:30123294:-1 gene:ENSG00000102882.11 gene_biotype:protein_coding transcript_biotype:protein_coding 0.000869458774104714 99.0000009536743 AAAAAQGGGGGEPRR Acetyl@N-term; Arg-loss@C-term Acetyl@27 cleaved M-A@N-term; missed R-R@14 -0.000493265979457647 1210.56872558594 606.2916 1210.56909179688 606.291809082031 2 17 3.1.1.46215.1 0.9097167 +170 38.5 38.52 57.8000009059906 51.0599970817566 49.1100013256073 ENST00000375436.8_24; ENST00000375433.3_22 [63 - 1754] ensembl_havana_transcript:known chromosome:GRCh38:1:17406760:17439724:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding; [3 - 1613] havana:known chromosome:GRCh38:1:17408676:17438561:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAAAAWEEPSSGNGTAR Cation:K(E)@7; Deamidated(N)@13 -0.00672993017360568 1683.68225097656 562.2347 1683.68908691406 562.236999511719 3 22 6.1.1.74990.1 1.405767 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000242285.10_9; ENST00000540080.5_5; ENST00000538225.5_8; ENST00000470744.5_8; ENST00000433436.6_9; ENST00000345519.9_8 [1 - 864] havana:known chromosome:GRCh38:9:36190937:36212059:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [61 - 699] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [61 - 891] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [3 - 815] havana:known chromosome:GRCh38:9:36190932:36212058:1 gene:ENSG00000122705.16 gene_biotype:protein_coding 
transcript_biotype:protein_coding; [61 - 945] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding; [3 - 788] ensembl_havana_transcript:known chromosome:GRCh38:9:36190923:36212056:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +340 25.71 25.75 48.7399995326996 35.5599999427795 35.5599999427795 ENST00000372289.6_25 [3 - 1664] ensembl_havana_transcript:known chromosome:GRCh38:1:44213491:44220681:1 gene:ENSG00000178028.13 gene_biotype:protein_coding transcript_biotype:protein_coding 2 99.0000009536743 AAEEGKDYPFAR missed K-D@6 -0.00249932007864118 1352.63342285156 451.8851 1352.63610839844 451.885955810547 3 16 6.1.1.34832.1 0.6571
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/text-data/output.tabular Wed Mar 01 14:01:57 2017 -0500 @@ -0,0 +1,10 @@ +597 16.21 16.26 50.9899973869324 35.8900010585785 33.6600005626678 ENST00000263025.8_18 [11 - 1222] ensembl_havana_transcript:known chromosome:GRCh38:16:30114105:30123294:-1 gene:ENSG00000102882.11 gene_biotype:protein_coding transcript_biotype:protein_coding 0.000869458774104714 99.0000009536743 AAAAAQGGGGGEPRR Acetyl@N-term; Arg-loss@C-term Acetyl@27 cleaved M-A@N-term; missed R-R@14 -0.000493265979457647 1210.56872558594 606.2916 1210.56909179688 606.291809082031 2 17 3.1.1.46215.1 0.9097167 +170 38.5 38.52 57.8000009059906 51.0599970817566 49.1100013256073 ENST00000375436.8_24 [63 - 1754] ensembl_havana_transcript:known chromosome:GRCh38:1:17406760:17439724:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAAAAWEEPSSGNGTAR Cation:K(E)@7; Deamidated(N)@13 -0.00672993017360568 1683.68225097656 562.2347 1683.68908691406 562.236999511719 3 22 6.1.1.74990.1 1.405767 +170 38.5 38.52 57.8000009059906 51.0599970817566 49.1100013256073 ENST00000375433.3_22 [3 - 1613] havana:known chromosome:GRCh38:1:17408676:17438561:-1 gene:ENSG00000179051.13 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAAAAWEEPSSGNGTAR Cation:K(E)@7; Deamidated(N)@13 -0.00672993017360568 1683.68225097656 562.2347 1683.68908691406 562.236999511719 3 22 6.1.1.74990.1 1.405767 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000242285.10_9 [1 - 864] havana:known chromosome:GRCh38:9:36190937:36212059:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000540080.5_5 [61 - 699] ensembl:known 
chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000538225.5_8 [61 - 891] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000470744.5_8 [3 - 815] havana:known chromosome:GRCh38:9:36190932:36212058:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000433436.6_9 [61 - 945] ensembl:known chromosome:GRCh38:9:36190856:36212061:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +2190 2.01 2.01 19.0999999642372 7.29200020432472 7.29200020432472 ENST00000345519.9_8 [3 - 788] ensembl_havana_transcript:known chromosome:GRCh38:9:36190923:36212056:1 gene:ENSG00000122705.16 gene_biotype:protein_coding transcript_biotype:protein_coding 0 99.0000009536743 AAEEAFVNDIDESSPGTEWER cleaved Q-A@N-term 0.00188760994933546 2351.02099609375 784.6809 2351.01904296875 784.680236816406 3 21 13.1.1.72954.1 1.4438 +340 25.71 25.75 48.7399995326996 35.5599999427795 35.5599999427795 
ENST00000372289.6_25 [3 - 1664] ensembl_havana_transcript:known chromosome:GRCh38:1:44213491:44220681:1 gene:ENSG00000178028.13 gene_biotype:protein_coding transcript_biotype:protein_coding 2 99.0000009536743 AAEEGKDYPFAR missed K-D@6 -0.00249932007864118 1352.63342285156 451.8851 1352.63610839844 451.885955810547 3 16 6.1.1.34832.1 0.6571