Mercurial > repos > peterjc > seq_length
changeset 2:6f29bb9960ac draft
v0.0.3 - Fixed SFF; more tests
author | peterjc |
---|---|
date | Mon, 14 May 2018 12:09:50 -0400 |
parents | 458f987918a6 |
children | fcdf11fb34de |
files | test-data/MID4_GLZRM4E04_rnd30.length.tabular test-data/MID4_GLZRM4E04_rnd30.sff tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml |
diffstat | 5 files changed, 85 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/MID4_GLZRM4E04_rnd30.length.tabular Mon May 14 12:09:50 2018 -0400 @@ -0,0 +1,31 @@ +#Identifier Length +GLZRM4E04IPVGX 97 +GLZRM4E04I9EJK 168 +GLZRM4E04JHW0A 386 +GLZRM4E04IBNFY 296 +GLZRM4E04IKCQP 447 +GLZRM4E04ITXFU 275 +GLZRM4E04JAY4Z 146 +GLZRM4E04H9M21 152 +GLZRM4E04IQDP5 350 +GLZRM4E04I0CYE 81 +GLZRM4E04I2NA7 97 +GLZRM4E04I5BWJ 248 +GLZRM4E04IK8WX 59 +GLZRM4E04I22QH 383 +GLZRM4E04IVVFA 81 +GLZRM4E04ILU3V 49 +GLZRM4E04IDVJT 320 +GLZRM4E04I3ZJ6 197 +GLZRM4E04I3UBT 288 +GLZRM4E04H59S1 362 +GLZRM4E04JFA38 345 +GLZRM4E04J4EK3 372 +GLZRM4E04IK96G 156 +GLZRM4E04JVL8Q 464 +GLZRM4E04IOQ36 389 +GLZRM4E04JBJJ1 264 +GLZRM4E04IEFNO 473 +GLZRM4E04JOT5I 186 +GLZRM4E04J4HNG 42 +GLZRM4E04JC544 331
--- a/tools/seq_length/README.rst Tue May 08 11:16:50 2018 -0400 +++ b/tools/seq_length/README.rst Mon May 14 12:09:50 2018 -0400 @@ -62,6 +62,9 @@ v0.0.1 - Initial version. v0.0.2 - Faster for FASTA and FASTQ. - Fixed typo. +v0.0.3 - Improved command line usage (outside of Galaxy). + - More tests (now covers SFF as well). + - Fix requesting SFF format. ======= ======================================================================
--- a/tools/seq_length/seq_length.py Tue May 08 11:16:50 2018 -0400 +++ b/tools/seq_length/seq_length.py Mon May 14 12:09:50 2018 -0400 @@ -20,10 +20,36 @@ from __future__ import print_function import sys +from optparse import OptionParser -if "-v" in sys.argv or "--version" in sys.argv: - print("v0.0.2") +usage = r"""Use as follows to compute all the lengths in a sequence file: + +$ python seq_length.py -i example.fasta -f fasta -o lengths.tsv +""" + +parser = OptionParser(usage=usage) +parser.add_option('-i', '--input', dest='input', + default=None, help='Input sequence filename (FASTA, FASTQ, etc)', + metavar="FILE") +parser.add_option('-f', '--format', dest='format', + default=None, help='Input sequence format (FASTA, QUAL, FASTQ, SFF)') +parser.add_option('-o', '--output', dest='output', + default=None, help='Output filename (tabular)', + metavar="FILE") +parser.add_option("-v", "--version", dest="version", + default=False, action="store_true", + help="Show version and quit") +options, args = parser.parse_args() + +if options.version: + print("v0.0.3") sys.exit(0) +if not options.input: + sys.exit("Require an input filename") +if not options.format: + sys.exit("Require the input format") +if not options.output: + sys.exit("Require an output filename") try: from Bio import SeqIO @@ -40,31 +66,25 @@ except ImportError: sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser") +in_file = options.input +out_file = options.output -# Parse Command Line -try: - in_file, seq_format, out_file = sys.argv[1:] -except ValueError: - sys.exit("Expected three arguments (input file, format, output file), " - "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) - - -if seq_format.startswith("fastq"): +if options.format.startswith("fastq"): # We don't care about the quality score encoding, just # need to translate Galaxy format name into something # Biopython will accept: format = "fastq" -elif seq_format.lower() == "csfasta": +elif options.format.lower() == "csfasta": # I have not tested with colour space FASTA format = "fasta" -elif seq_format.lower == "sff": +elif options.format.lower() == "sff": # The masked/trimmed numbers are more interesting format = "sff-trim" -elif seq_format.lower() in ["fasta", "qual"]: - format = seq_format.lower() +elif options.format.lower() in ["fasta", "qual"]: + format = options.format.lower() else: # TODO: Does Galaxy understand GenBank, EMBL, etc yet? - sys.exit("Unexpected format argument: %r" % seq_format) + sys.exit("Unexpected format argument: %r" % options.format) count = 0
--- a/tools/seq_length/seq_length.xml Tue May 08 11:16:50 2018 -0400 +++ b/tools/seq_length/seq_length.xml Mon May 14 12:09:50 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="seq_length" name="Sequence lengths" version="0.0.2"> +<tool id="seq_length" name="Sequence lengths" version="0.0.3"> <description>from FASTA, QUAL, FASTQ, or SFF file</description> <requirements> <!-- This is the currently the last release of Biopython which is available via Galaxy's legacy XML packaging system --> @@ -8,7 +8,7 @@ python $__tool_directory__/seq_length.py --version </version_command> <command detect_errors="aggressive"> -python $__tool_directory__/seq_length.py '$input_file' '$input_file.ext' '$output_file' +python $__tool_directory__/seq_length.py -i '$input_file' -f '$input_file.ext' -o '$output_file' </command> <inputs> <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file" help="FASTA, QUAL, FASTQ, or SFF format." /> @@ -20,10 +20,23 @@ <test> <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> <output name="output_file" file="four_human_proteins.length.tabular" ftype="tabular" /> + <assert_stdout> + <has_line line="4 sequences, total length 3297" /> + </assert_stdout> </test> <test> <param name="input_file" value="SRR639755_sample_strict.fastq" ftype="fastq" /> <output name="output_file" file="SRR639755_sample_strict.length.tabular" ftype="tabular" /> + <assert_stdout> + <has_line line="2 sequences, total length 202" /> + </assert_stdout> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30.length.tabular" ftype="tabular" /> + <assert_stdout> + <has_line line="30 sequences, total length 7504" /> + </assert_stdout> </test> </tests> <help>