Mercurial > repos > devteam > fasta_to_tabular
changeset 2:091edad7622f draft
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_to_tabular commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author | devteam |
---|---|
date | Sun, 01 Mar 2020 07:25:01 -0500 |
parents | 7e801ab2b70e |
children | e7ed3c310b74 |
files | fasta_to_tabular.py fasta_to_tabular.xml |
diffstat | 2 files changed, 99 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/fasta_to_tabular.py Wed Nov 11 12:14:09 2015 -0500 +++ b/fasta_to_tabular.py Sun Mar 01 07:25:01 2020 -0500 @@ -6,52 +6,53 @@ format convert: fasta to tabular """ -import sys, os +import sys + -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() +def stop_err(msg): + sys.exit(msg) + def __main__(): if len(sys.argv) != 5: stop_err("Wrong number of argument. Expect four (fasta, tabular, truncation, columns)") infile = sys.argv[1] outfile = sys.argv[2] - keep_first = int( sys.argv[3] ) - descr_split = int( sys.argv[4] ) - fasta_title = fasta_seq = '' + keep_first = int(sys.argv[3]) + descr_split = int(sys.argv[4]) if keep_first == 0: keep_first = None elif descr_split == 1: - #Added one for the ">" character - #(which is removed if using descr_split > 1) + # Added one for the ">" character + # (which is removed if using descr_split > 1) keep_first += 1 if descr_split < 1: stop_err("Bad description split value (should be 1 or more)") - out = open( outfile, 'w' ) - for i, line in enumerate( open( infile ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - #Don't want any existing tabs to trigger extra columns: - line = line.replace('\t', ' ') - if i > 0: - out.write('\n') - if descr_split == 1: - out.write(line[1:keep_first]) + with open(outfile, 'w') as out, open(infile) as in_fh: + for i, line in enumerate(in_fh): + line = line.rstrip('\r\n') + if not line or line.startswith('#'): + continue + if line.startswith('>'): + # Don't want any existing tabs to trigger extra columns: + line = line.replace('\t', ' ') + if i > 0: + out.write('\n') + if descr_split == 1: + out.write(line[1:keep_first]) + else: + words = line[1:].split(None, descr_split - 1) + # apply any truncation to first word (the id) + words[0] = words[0][0:keep_first] + # pad with empty columns if required + words += [""] * (descr_split - len(words)) + out.write("\t".join(words)) + out.write('\t') else: - words = line[1:].split(None, descr_split-1) - #apply any truncation to first word (the id) - words[0] = words[0][0:keep_first] - #pad with empty columns if required - words += [""]*(descr_split-len(words)) - out.write("\t".join(words)) - out.write('\t') - else: - out.write(line) - if i > 0: - out.write('\n') - out.close() + out.write(line) + if i > 0: + out.write('\n') -if __name__ == "__main__" : __main__() + +if __name__ == "__main__": + __main__()
--- a/fasta_to_tabular.xml Wed Nov 11 12:14:09 2015 -0500 +++ b/fasta_to_tabular.xml Sun Mar 01 07:25:01 2020 -0500 @@ -1,64 +1,67 @@ -<tool id="fasta2tab" name="FASTA-to-Tabular" version="1.1.0"> - <description>converter</description> - <command interpreter="python">fasta_to_tabular.py $input $output $keep_first $descr_columns</command> - <inputs> - <param name="input" type="data" format="fasta" label="Convert these sequences"/> - <param name="descr_columns" type="integer" value="1" label="How many columns to divide title string into?" help="Typically 2 to take the ID (first word) and decription (rest) as two columns, or 1 to give a single column"> - <validator type="in_range" min="1" /> - </param> - <param name="keep_first" type="integer" value="0" label="How many title characters to keep?" help="Applies only to the first column taken from the title string ('0' = keep the whole thing), useful when your sequence identifiers are all the same length."> - <validator type="in_range" min="0" /> - </param> - </inputs> - <outputs> - <data name="output" format="tabular"/> - </outputs> - <tests> - <test> - <param name="input" value="454.fasta" /> - <param name="descr_columns" value="1"/> - <param name="keep_first" value="0"/> - <output name="output" file="fasta_to_tabular_out1.tabular" /> - </test> - - <test> - <param name="input" value="4.fasta" /> - <param name="descr_columns" value="1"/> - <param name="keep_first" value="0"/> - <output name="output" file="fasta_to_tabular_out2.tabular" /> - </test> - - <test> - <param name="input" value="454.fasta" /> - <param name="descr_columns" value="1"/> - <param name="keep_first" value="14"/> - <output name="output" file="fasta_to_tabular_out3.tabular" /> - </test> +<tool id="fasta2tab" name="FASTA-to-Tabular" version="1.1.1" profile="16.04"> + <description>converter</description> + <requirements> + <requirement type="package" version="3.7">python</requirement> + </requirements> + <command> +python '$__tool_directory__/fasta_to_tabular.py' '$input' '$output' $keep_first $descr_columns + </command> + <inputs> + <param name="input" type="data" format="fasta" label="Convert these sequences"/> + <param name="descr_columns" type="integer" value="1" min="1" label="How many columns to divide title string into?" help="Typically 2 to take the ID (first word) and decription (rest) as two columns, or 1 to give a single column"> + </param> + <param name="keep_first" type="integer" value="0" min="0" label="How many title characters to keep?" help="Applies only to the first column taken from the title string ('0' = keep the whole thing), useful when your sequence identifiers are all the same length."> + </param> + </inputs> + <outputs> + <data name="output" format="tabular"/> + </outputs> + <tests> + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="1"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out1.tabular" /> + </test> - <test> - <param name="input" value="454.fasta" /> - <param name="descr_columns" value="2"/> - <param name="keep_first" value="0"/> - <output name="output" file="fasta_to_tabular_out4.tabular" /> - </test> + <test> + <param name="input" value="4.fasta" /> + <param name="descr_columns" value="1"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out2.tabular" /> + </test> + + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="1"/> + <param name="keep_first" value="14"/> + <output name="output" file="fasta_to_tabular_out3.tabular" /> + </test> - <test> - <param name="input" value="454.fasta" /> - <param name="descr_columns" value="5"/> - <param name="keep_first" value="0"/> - <output name="output" file="fasta_to_tabular_out5.tabular" /> - </test> + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="2"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out4.tabular" /> + </test> - <test> - <param name="input" value="454.fasta" /> - <param name="descr_columns" value="5"/> - <param name="keep_first" value="10"/> - <output name="output" file="fasta_to_tabular_out6.tabular" /> - </test> + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="5"/> + <param name="keep_first" value="0"/> + <output name="output" file="fasta_to_tabular_out5.tabular" /> + </test> - </tests> - <help> - + <test> + <param name="input" value="454.fasta" /> + <param name="descr_columns" value="5"/> + <param name="keep_first" value="10"/> + <output name="output" file="fasta_to_tabular_out6.tabular" /> + </test> + + </tests> + <help><![CDATA[ + **What it does** This tool converts FASTA formatted sequences to TAB-delimited format. @@ -70,16 +73,16 @@ The option *How many characters to keep?* allows to select a specified number of letters from the beginning of each FASTA entry. With the introduction of the **How many columns to divide title string into?** option this setting is of limited use, but does still allow you to truncate the identifier. ------ +----- **Example** Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ + >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG - >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ + >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA Running this tool with the default settings will produce this (2 column output): @@ -124,5 +127,5 @@ Note the sequences have been truncated for display purposes in the above tables. - </help> + ]]></help> </tool>