Previous changeset 4:d51819d2d7e2 (2013-07-29) Next changeset 6:64e67f172188 (2013-11-21) |
Commit message:
Uploaded v0.0.5 dependant on Biopython 1.62 |
added:
tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/repository_dependencies.xml |
removed:
tools/filters/get_orfs_or_cdss.py tools/filters/get_orfs_or_cdss.rst tools/filters/get_orfs_or_cdss.xml tools/filters/repository_dependencies.xml |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/filters/get_orfs_or_cdss.py --- a/tools/filters/get_orfs_or_cdss.py Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,223 +0,0 @@ -#!/usr/bin/env python -"""Find ORFs in a nucleotide sequence file. - -get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file - -Takes ten command line options, input sequence filename, format, genetic -code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), -minimum length (in amino acids), strand (both, forward, reverse), output -nucleotide filename, and output protein filename. - -This tool is a short Python script which requires Biopython. If you use -this tool in scientific work leading to a publication, please cite the -Biopython application note: - -Cock et al 2009. Biopython: freely available Python tools for computational -molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. - -This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute -(formerly SCRI), Dundee, UK. All rights reserved. - -See accompanying text file for licence details (MIT/BSD style). - -This is version 0.0.3 of the script. -""" -import sys -import re - -if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.3" - sys.exit(0) - -def stop_err(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -try: - from Bio.Seq import Seq, reverse_complement, translate - from Bio.SeqRecord import SeqRecord - from Bio import SeqIO - from Bio.Data import CodonTable -except ImportError: - stop_err("Missing Biopython library") - -#Parse Command Line -try: - input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] -except ValueError: - stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) - -try: - table = int(table) -except ValueError: - stop_err("Expected integer for genetic code table, got %s" % table) - -try: - table_obj = CodonTable.ambiguous_generic_by_id[table] -except KeyError: - stop_err("Unknown codon table %i" % table) - -if ftype not in ["CDS", "ORF"]: - stop_err("Expected CDS or ORF, got %s" % ftype) - -if ends not in ["open", "closed"]: - stop_err("Expected open or closed for end treatment, got %s" % ends) - -try: - min_len = int(min_len) -except ValueError: - stop_err("Expected integer for min_len, got %s" % min_len) - -if seq_format.lower()=="sff": - seq_format = "sff-trim" -elif seq_format.lower()=="fasta": - seq_format = "fasta" -elif seq_format.lower().startswith("fastq"): - seq_format = "fastq" -else: - stop_err("Unsupported file type %r" % seq_format) - -print "Genetic code table %i" % table -print "Minimum length %i aa" % min_len -#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) - -starts = sorted(table_obj.start_codons) -assert "NNN" not in starts -re_starts = re.compile("|".join(starts)) - -stops = sorted(table_obj.stop_codons) -assert "NNN" not in stops -re_stops = re.compile("|".join(stops)) - -def start_chop_and_trans(s, strict=True): - """Returns offset, trimmed nuc, protein.""" - if strict: - assert s[-3:] in stops, s - assert len(s) % 3 == 0 - for match in re_starts.finditer(s): - #Must check the start is in frame - start = match.start() - if start % 3 == 0: - n = s[start:] - assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) - if strict: - t = translate(n, table, cds=True) - else: - #Use when missing stop codon, - t = "M" + translate(n[3:], table, to_stop=True) - return start, n, t - return None, None, None - -def break_up_frame(s): - """Returns offset, nuc, protein.""" - start = 0 - for match in re_stops.finditer(s): - index = match.start() + 3 - if index % 3 != 0: - continue - n = s[start:index] - if ftype=="CDS": - offset, n, t = start_chop_and_trans(n) - else: - offset = 0 - t = translate(n, table, to_stop=True) - if n and len(t) >= min_len: - yield start + offset, n, t - start = index - if ends == "open": - #No stop codon, Biopython's strict CDS translate will fail - n = s[start:] - #Ensure we have whole codons - #TODO - Try appending N instead? - #TODO - Do the next four lines more elegantly - if len(n) % 3: - n = n[:-1] - if len(n) % 3: - n = n[:-1] - if ftype=="CDS": - offset, n, t = start_chop_and_trans(n, strict=False) - else: - offset = 0 - t = translate(n, table, to_stop=True) - if n and len(t) >= min_len: - yield start + offset, n, t - - -def get_all_peptides(nuc_seq): - """Returns start, end, strand, nucleotides, protein. - - Co-ordinates are Python style zero-based. - """ - #TODO - Refactor to use a generator function (in start order) - #rather than making a list and sorting? - answer = [] - full_len = len(nuc_seq) - if strand != "reverse": - for frame in range(0,3): - for offset, n, t in break_up_frame(nuc_seq[frame:]): - start = frame + offset #zero based - answer.append((start, start + len(n), +1, n, t)) - if strand != "forward": - rc = reverse_complement(nuc_seq) - for frame in range(0,3) : - for offset, n, t in break_up_frame(rc[frame:]): - start = full_len - frame - offset #zero based - answer.append((start - len(n), start, -1, n ,t)) - answer.sort() - return answer - -def get_top_peptides(nuc_seq): - """Returns all peptides of max length.""" - values = list(get_all_peptides(nuc_seq)) - if not values: - raise StopIteration - max_len = max(len(x[-1]) for x in values) - for x in values: - if len(x[-1]) == max_len: - yield x - -def get_one_peptide(nuc_seq): - """Returns first (left most) peptide with max length.""" - values = list(get_top_peptides(nuc_seq)) - if not values: - raise StopIteration - yield values[0] - -if mode == "all": - get_peptides = get_all_peptides -elif mode == "top": - get_peptides = get_top_peptides -elif mode == "one": - get_peptides = get_one_peptide - -in_count = 0 -out_count = 0 -if out_nuc_file == "-": - out_nuc = sys.stdout -else: - out_nuc = open(out_nuc_file, "w") -if out_prot_file == "-": - out_prot = sys.stdout -else: - out_prot = open(out_prot_file, "w") -for record in SeqIO.parse(input_file, seq_format): - for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): - out_count += 1 - if f_strand == +1: - loc = "%i..%i" % (f_start+1, f_end) - else: - loc = "complement(%i..%i)" % (f_start+1, f_end) - descr = "length %i aa, %i bp, from %s of %s" \ - % (len(t), len(n), loc, record.description) - r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) - t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) - SeqIO.write(r, out_nuc, "fasta") - SeqIO.write(t, out_prot, "fasta") - in_count += 1 -if out_nuc is not sys.stdout: - out_nuc.close() -if out_prot is not sys.stdout: - out_prot.close() - -print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/filters/get_orfs_or_cdss.rst --- a/tools/filters/get_orfs_or_cdss.rst Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,121 +0,0 @@ -Galaxy tool to find ORFs or simple CDSs -======================================= - -This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute -(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. -See the licence text below. - -This tool is a short Python script (using Biopython library functions) -to search nucleotide sequences for open reading frames (ORFs) or coding -sequences (CDSs) where the first potential start codon is used. See the -help text in the XML file for more information. - -This tool is available from the Galaxy Tool Shed at: - -* http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss - -See also the EMBOSS tool ``getorf`` which offers similar functionality and -has also been wrapped for use within Galaxy. - - -Automated Installation -====================== - -This should be straightforward using the Galaxy Tool Shed, which should be -able to automatically install the dependency on Biopython, and then install -this tool and run its unit tests. - - -Manual Installation -=================== - -There are just two files to install to use this tool from within Galaxy: - -* get_orfs_or_cdss.py (the Python script) -* get_orfs_or_cdss.xml (the Galaxy tool definition) - -If you are installing this manually (rather than via the Tool Shed), the -suggested location is in the Galaxy folder tools/filters next to the tool -for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL. -You will also need to modify the tools_conf.xml file to tell Galaxy to offer the -tool. One suggested location is in the filters section. Simply add the line:: - - <tool file="filters/get_orfs_or_cdss.xml" /> - -You will also need to install Biopython 1.54 or later. If you want to run -the unit tests, include this line in tools_conf.xml.sample and the sample -FASTA files under the test-data directory. Then:: - - ./run_functional_tests.sh -id get_orfs_or_cdss - -That's it. - - -History -======= - -======= ====================================================================== -Version Changes -------- ---------------------------------------------------------------------- -v0.0.1 - Initial version. -v0.0.2 - Correct labelling issue on reverse strand. - - Use the new <stdio> settings in the XML wrappers to catch errors -v0.0.3 - Include unit tests. - - Record Python script version when run from Galaxy. -v0.0.4 - Link to Tool Shed added to help text and this documentation. -v0.0.5 - Automated intallation of the Biopython dependency. - - Use reStructuredText for this README file. - - Adopt standard MIT License. -======= ====================================================================== - - -Developers -========== - -This script and related tools are being developed on the following hg branch: -http://bitbucket.org/peterjc/galaxy-central/src/tools - -For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder:: - - $ tar -czf get_orfs_or_cdss.tar.gz tools/filters/get_orfs_or_cdss.* tools/filters/repository_dependencies.xml test-data/get_orf_input*.fasta test-data/Ssuis.fasta - -Check this worked:: - - $ tar -tzf get_orfs_or_cdss.tar.gz - filter/get_orfs_or_cdss.py - filter/get_orfs_or_cdss.rst - filter/get_orfs_or_cdss.xml - tools/filters/repository_dependencies.xml - test-data/get_orf_input.fasta - test-data/get_orf_input.Suis_ORF.nuc.fasta - test-data/get_orf_input.Suis_ORF.prot.fasta - test-data/get_orf_input.t11_nuc_out.fasta - test-data/get_orf_input.t11_open_nuc_out.fasta - test-data/get_orf_input.t11_open_prot_out.fasta - test-data/get_orf_input.t11_prot_out.fasta - test-data/get_orf_input.t1_nuc_out.fasta - test-data/get_orf_input.t1_prot_out.fasta - test-data/Ssuis.fasta - - -Licence (MIT) -============= - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/filters/get_orfs_or_cdss.xml --- a/tools/filters/get_orfs_or_cdss.xml Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,166 +0,0 @@\n-<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.5">\n-\t<description>e.g. to get peptides from ESTs</description>\n-\t<version_command interpreter="python">get_orfs_or_cdss.py --version</version_command>\n-\t<command interpreter="python">\n-get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file\n-\t</command>\n-\t<stdio>\n-\t\t<!-- Anything other than zero is an error -->\n-\t\t<exit_code range="1:" />\n-\t\t<exit_code range=":-1" />\n-\t</stdio>\n-\t<inputs>\n-\t\t<param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." />\n-\t\t<param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons">\n-\t\t\t<option value="1">1. Standard</option>\n-\t\t\t<option value="2">2. Vertebrate Mitochondrial</option>\n-\t\t\t<option value="3">3. Yeast Mitochondrial</option>\n-\t\t\t<option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>\n-\t\t\t<option value="5">5. Invertebrate Mitochondrial</option>\n-\t\t\t<option value="6">6. Ciliate Macronuclear and Dasycladacean</option>\n-\t\t\t<option value="9">9. Echinoderm Mitochondrial</option>\n-\t\t\t<option value="10">10. Euplotid Nuclear</option>\n-\t\t\t<option value="11">11. Bacterial</option>\n-\t\t\t<option value="12">12. Alternative Yeast Nuclear</option>\n-\t\t\t<option value="13">13. Ascidian Mitochondrial</option>\n-\t\t\t<option value="14">14. Flatworm Mitochondrial</option>\n-\t\t\t<option value="15">15. Blepharisma Macronuclear</option>\n-\t\t\t<option value="16">16. Chlorophycean Mitochondrial</option>\n-\t\t\t<option value="21">21. Trematode Mitochondrial</option>\n-\t\t\t<option value="22">22. Scenedesmus obliquus</option>\n-\t\t\t<option value="23">23. Thraustochytrium Mitochondrial</option>\n-\t\t</param>\n-\t\t<param name="ftype" type="select" value="True" label="Look for ORFs or CDSs">\n- <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option>\n- <option value="CDS">Look for CDSs (with start and stop codons)</option>\n-\t\t</param>\n- <param name="ends" type="select" value="open" label="Sequence end treatment">\n-\t\t\t<option value="open">Open ended (will allow missing start/stop codons at the ends)</option>\n- <option value="closed">Complete (will check for start/stop codons at the ends)</option>\n- <!-- TODO? Circular, for using this on finished bacteria etc -->\n- </param>\n-\n-\t\t<param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF.">\n- <option value="all">All ORFs/CDSs from each sequence</option>\n- <option value="top">All ORFs/CDSs from each sequence with the maximum length</option>\n- <option value="one">First ORF/CDS from each sequence with the maximum length</option>\n-\t\t</param>\n- <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)">\n- </param>\n- <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing.">\n- <option value="both">Search both the forward and reverse strand</option>\n- <option value="forward">Only search the forward strand</option>\n- <option value="reverse">Only search the reverse strand</option>\n- </param>\n-\t</inputs>\n-\t<outputs>\n-\t\t<data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" />\n-\t\t<data name="out_prot_file" format="fasta" label="'..b'input.t11_prot_out.fasta" />\n-\t\t</test>\n-\t\t<test>\n- <param name="input_file" value="get_orf_input.fasta" />\n- <param name="table" value="11" />\n- <param name="ftype" value="CDS" />\n- <param name="ends" value="open" />\n- <param name="mode" value="all" />\n- <param name="min_len" value="10" />\n- <param name="strand" value="forward" />\n- <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" />\n- <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" />\n-\t\t</test>\n- <test>\n-\t\t\t<param name="input_file" value="Ssuis.fasta" />\n-\t\t\t<param name="table" value="11" />\n-\t\t\t<param name="ftype" value="ORF" />\n-\t\t\t<param name="ends" value="open" />\n-\t\t\t<param name="mode" value="all" />\n-\t\t\t<param name="min_len" value="100" />\n-\t\t\t<param name="strand" value="both" />\n-\t\t\t<output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" />\n-\t\t\t<output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" />\n-\t\t</test>\n-\t</tests>\n-\t<requirements>\n-\t\t<requirement type="python-module">Bio</requirement>\n-\t</requirements>\n-\t<help>\n-\n-**What it does**\n-\n-Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ\n-and Standard Flowgram Format (SFF) are supported), and searches each sequence\n-for open reading frames (ORFs) or potential coding sequences (CDSs) of the\n-given minimum length. These are returned as FASTA files of nucleotides and\n-protein sequences.\n-\n-You can choose to have all the ORFs/CDSs above the minimum length for each\n-sequence (similar to the EMBOSS getorf tool), those with the longest length\n-equal, or the first ORF/CDS with the longest length (in the special case\n-where a sequence encodes two or more long ORFs/CDSs of the same length). The\n-last option is a reasonable choice when the input sequences represent EST or\n-mRNA sequences, where only one ORF/CDS is expected.\n-\n-Note that if no ORFs/CDSs in a sequence match the criteria, there will be no\n-output for that sequence.\n-\n-Also note that the ORFs/CDSs are assigned modified identifiers to distinguish\n-them from the original full length sequences, by appending a suffix.\n-\n-The start and stop codons are taken from the `NCBI Genetic Codes\n-<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_.\n-When searching for ORFs, the sequences will run from stop codon to stop\n-codon, and any start codons are ignored. When searching for CDSs, the first\n-potential start codon will be used, giving the longest possible CDS within\n-each ORF, and thus the longest possible protein sequence. This is useful\n-for things like BLAST or domain searching, but since this may not be the\n-correct start codon may not be appropriate for signal peptide detection\n-etc.\n-\n-**Example Usage**\n-\n-Given some EST sequences (Sanger capillary reads) assembled into unigenes,\n-or a transcriptome assembly from some RNA-Seq, each of your nucleotide\n-sequences should (barring sequencing, assembly errors, frame-shifts etc)\n-encode one protein as a single ORF/CDS, which you wish to extract (and\n-perhaps translate into amino acids).\n-\n-If your RNS-Seq data was strand specific, and assembled taking this into\n-account, you should only search for ORFs/CDSs on the forward strand.\n-\n-**Citation**\n-\n-This tool uses Biopython. If you use this tool in scientific work leading\n-to a publication, please cite the Biopython application note (and Galaxy\n-too of course):\n-\n-Cock et al 2009. Biopython: freely available Python tools for computational\n-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.\n-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.\n-\n-This tool is available to install into other Galaxy Instances via the Galaxy\n-Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss\n-\t</help>\n-</tool>\n' |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/filters/repository_dependencies.xml --- a/tools/filters/repository_dependencies.xml Mon Jul 29 09:30:44 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<repositories description="This requires Biopython as a dependency."> -<!-- Leave out the tool shed and revision to get the current - tool shed and latest revision at the time of upload --> -<repository changeset_revision="5d0c54f7fea2" name="package_biopython_1_61" owner="biopython" toolshed="http://toolshed.g2.bx.psu.edu" /> -</repositories> |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/get_orfs_or_cdss/README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/README.rst Mon Oct 28 05:19:38 2013 -0400 |
b |
@@ -0,0 +1,125 @@ +Galaxy tool to find ORFs or simple CDSs +======================================= + +This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below (MIT licence). + +This tool is a short Python script (using Biopython library functions) +to search nucleotide sequences for open reading frames (ORFs) or coding +sequences (CDSs) where the first potential start codon is used. See the +help text in the XML file for more information. + +This tool is available from the Galaxy Tool Shed at: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss + +See also the EMBOSS tool ``getorf`` which offers similar functionality and +has also been wrapped for use within Galaxy. + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython, and then install +this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install to use this tool from within Galaxy: + +* get_orfs_or_cdss.py (the Python script) +* get_orfs_or_cdss.xml (the Galaxy tool definition) + +The suggested location is in a dedicated tools/get_orfs_or_cdss folder. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line:: + + <tool file="get_orfs_or_cdss/get_orfs_or_cdss.xml" /> + +You will also need to install Biopython 1.54 or later. If you want to run +the unit tests, include this line in tools_conf.xml.sample and the sample +FASTA files under the test-data directory. Then:: + + ./run_functional_tests.sh -id get_orfs_or_cdss + +That's it. + + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version. +v0.0.2 - Correct labelling issue on reverse strand. + - Use the new <stdio> settings in the XML wrappers to catch errors +v0.0.3 - Include unit tests. + - Record Python script version when run from Galaxy. +v0.0.4 - Link to Tool Shed added to help text and this documentation. +v0.0.5 - Automated intallation of the Biopython dependency. + - Use reStructuredText for this README file. + - Adopt standard MIT License. + - Updated citation information (Cock et al. 2013). + - Renamed folder and adopted README.rst naming. +======= ====================================================================== + + +Developers +========== + +This script and related tools were initially developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +Development has now moved to a dedicated GitHub repository: +https://github.com/peterjc/pico_galaxy/tree/master/tools + +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder:: + + $ tar -czf get_orfs_or_cdss.tar.gz tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.* tools/get_orfs_or_cdss/repository_dependencies.xml test-data/get_orf_input*.fasta test-data/Ssuis.fasta + +Check this worked:: + + $ tar -tzf get_orfs_or_cdss.tar.gz + tools/get_orfs_or_cdss/README.rst + tools/get_orfs_or_cdss/get_orfs_or_cdss.py + tools/get_orfs_or_cdss/get_orfs_or_cdss.xml + tools/get_orfs_or_cdss/repository_dependencies.xml + test-data/get_orf_input.fasta + test-data/get_orf_input.Suis_ORF.nuc.fasta + test-data/get_orf_input.Suis_ORF.prot.fasta + test-data/get_orf_input.t11_nuc_out.fasta + test-data/get_orf_input.t11_open_nuc_out.fasta + test-data/get_orf_input.t11_open_prot_out.fasta + test-data/get_orf_input.t11_prot_out.fasta + test-data/get_orf_input.t1_nuc_out.fasta + test-data/get_orf_input.t1_prot_out.fasta + test-data/Ssuis.fasta + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/get_orfs_or_cdss/get_orfs_or_cdss.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Mon Oct 28 05:19:38 2013 -0400 |
[ |
@@ -0,0 +1,223 @@ +#!/usr/bin/env python +"""Find ORFs in a nucleotide sequence file. + +get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file + +Takes ten command line options, input sequence filename, format, genetic +code, CDS vs ORF, end type (open, closed), selection mode (all, top, one), +minimum length (in amino acids), strand (both, forward, reverse), output +nucleotide filename, and output protein filename. + +This tool is a short Python script which requires Biopython. If you use +this tool in scientific work leading to a publication, please cite the +Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI), Dundee, UK. All rights reserved. + +See accompanying text file for licence details (MIT/BSD style). + +This is version 0.0.3 of the script. +""" +import sys +import re + +if "-v" in sys.argv or "--version" in sys.argv: + print "v0.0.3" + sys.exit(0) + +def stop_err(msg, err=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(err) + +try: + from Bio.Seq import Seq, reverse_complement, translate + from Bio.SeqRecord import SeqRecord + from Bio import SeqIO + from Bio.Data import CodonTable +except ImportError: + stop_err("Missing Biopython library") + +#Parse Command Line +try: + input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:] +except ValueError: + stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) + +try: + table = int(table) +except ValueError: + stop_err("Expected integer for genetic code table, got %s" % table) + +try: + table_obj = CodonTable.ambiguous_generic_by_id[table] +except KeyError: + stop_err("Unknown codon table %i" % table) + +if ftype not in ["CDS", "ORF"]: + stop_err("Expected CDS or ORF, got %s" % ftype) + +if ends not in ["open", "closed"]: + stop_err("Expected open or closed for end treatment, got %s" % ends) + +try: + min_len = int(min_len) +except ValueError: + stop_err("Expected integer for min_len, got %s" % min_len) + +if seq_format.lower()=="sff": + seq_format = "sff-trim" +elif seq_format.lower()=="fasta": + seq_format = "fasta" +elif seq_format.lower().startswith("fastq"): + seq_format = "fastq" +else: + stop_err("Unsupported file type %r" % seq_format) + +print "Genetic code table %i" % table +print "Minimum length %i aa" % min_len +#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand) + +starts = sorted(table_obj.start_codons) +assert "NNN" not in starts +re_starts = re.compile("|".join(starts)) + +stops = sorted(table_obj.stop_codons) +assert "NNN" not in stops +re_stops = re.compile("|".join(stops)) + +def start_chop_and_trans(s, strict=True): + """Returns offset, trimmed nuc, protein.""" + if strict: + assert s[-3:] in stops, s + assert len(s) % 3 == 0 + for match in re_starts.finditer(s): + #Must check the start is in frame + start = match.start() + if start % 3 == 0: + n = s[start:] + assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) + if strict: + t = translate(n, table, cds=True) + else: + #Use when missing stop codon, + t = "M" + translate(n[3:], table, to_stop=True) + return start, n, t + return None, None, None + +def break_up_frame(s): + """Returns offset, nuc, protein.""" + start = 0 + for match in re_stops.finditer(s): + index = match.start() + 3 + if index % 3 != 0: + continue + n = s[start:index] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + start = index + if ends == "open": + #No stop codon, Biopython's strict CDS translate will fail + n = s[start:] + #Ensure we have whole codons + #TODO - Try appending N instead? + #TODO - Do the next four lines more elegantly + if len(n) % 3: + n = n[:-1] + if len(n) % 3: + n = n[:-1] + if ftype=="CDS": + offset, n, t = start_chop_and_trans(n, strict=False) + else: + offset = 0 + t = translate(n, table, to_stop=True) + if n and len(t) >= min_len: + yield start + offset, n, t + + +def get_all_peptides(nuc_seq): + """Returns start, end, strand, nucleotides, protein. + + Co-ordinates are Python style zero-based. + """ + #TODO - Refactor to use a generator function (in start order) + #rather than making a list and sorting? + answer = [] + full_len = len(nuc_seq) + if strand != "reverse": + for frame in range(0,3): + for offset, n, t in break_up_frame(nuc_seq[frame:]): + start = frame + offset #zero based + answer.append((start, start + len(n), +1, n, t)) + if strand != "forward": + rc = reverse_complement(nuc_seq) + for frame in range(0,3) : + for offset, n, t in break_up_frame(rc[frame:]): + start = full_len - frame - offset #zero based + answer.append((start - len(n), start, -1, n ,t)) + answer.sort() + return answer + +def get_top_peptides(nuc_seq): + """Returns all peptides of max length.""" + values = list(get_all_peptides(nuc_seq)) + if not values: + raise StopIteration + max_len = max(len(x[-1]) for x in values) + for x in values: + if len(x[-1]) == max_len: + yield x + +def get_one_peptide(nuc_seq): + """Returns first (left most) peptide with max length.""" + values = list(get_top_peptides(nuc_seq)) + if not values: + raise StopIteration + yield values[0] + +if mode == "all": + get_peptides = get_all_peptides +elif mode == "top": + get_peptides = get_top_peptides +elif mode == "one": + get_peptides = get_one_peptide + +in_count = 0 +out_count = 0 +if out_nuc_file == "-": + out_nuc = sys.stdout +else: + out_nuc = open(out_nuc_file, "w") +if out_prot_file == "-": + out_prot = sys.stdout +else: + out_prot = open(out_prot_file, "w") +for record in SeqIO.parse(input_file, seq_format): + for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())): + out_count += 1 + if f_strand == +1: + loc = "%i..%i" % (f_start+1, f_end) + else: + loc = "complement(%i..%i)" % (f_start+1, f_end) + descr = "length %i aa, %i bp, from %s of %s" \ + % (len(t), len(n), loc, record.description) + r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr) + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + in_count += 1 +if out_nuc is not sys.stdout: + out_nuc.close() +if out_prot is not sys.stdout: + out_prot.close() + +print "Found %i %ss in %i sequences" % (out_count, ftype, in_count) |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/get_orfs_or_cdss/get_orfs_or_cdss.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Mon Oct 28 05:19:38 2013 -0400 |
b |
b'@@ -0,0 +1,171 @@\n+<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.5">\n+ <description>e.g. to get peptides from ESTs</description>\n+ <requirements>\n+ <requirement type="package" version="1.62">biopython</requirement>\n+ <requirement type="python-module">Bio</requirement>\n+ </requirements>\n+ <version_command interpreter="python">get_orfs_or_cdss.py --version</version_command>\n+ <command interpreter="python">\n+get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file\n+ </command>\n+ <stdio>\n+ <!-- Anything other than zero is an error -->\n+ <exit_code range="1:" />\n+ <exit_code range=":-1" />\n+ </stdio>\n+ <inputs>\n+ <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." />\n+ <param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons">\n+ <option value="1">1. Standard</option>\n+ <option value="2">2. Vertebrate Mitochondrial</option>\n+ <option value="3">3. Yeast Mitochondrial</option>\n+ <option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>\n+ <option value="5">5. Invertebrate Mitochondrial</option>\n+ <option value="6">6. Ciliate Macronuclear and Dasycladacean</option>\n+ <option value="9">9. Echinoderm Mitochondrial</option>\n+ <option value="10">10. Euplotid Nuclear</option>\n+ <option value="11">11. Bacterial</option>\n+ <option value="12">12. Alternative Yeast Nuclear</option>\n+ <option value="13">13. Ascidian Mitochondrial</option>\n+ <option value="14">14. Flatworm Mitochondrial</option>\n+ <option value="15">15. Blepharisma Macronuclear</option>\n+ <option value="16">16. Chlorophycean Mitochondrial</option>\n+ <option value="21">21. Trematode Mitochondrial</option>\n+ <option value="22">22. Scenedesmus obliquus</option>\n+ <option value="23">23. Thraustochytrium Mitochondrial</option>\n+ </param>\n+ <param name="ftype" type="select" value="True" label="Look for ORFs or CDSs">\n+ <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option>\n+ <option value="CDS">Look for CDSs (with start and stop codons)</option>\n+ </param>\n+ <param name="ends" type="select" value="open" label="Sequence end treatment">\n+ <option value="open">Open ended (will allow missing start/stop codons at the ends)</option>\n+ <option value="closed">Complete (will check for start/stop codons at the ends)</option>\n+ <!-- TODO? Circular, for using this on finished bacteria etc -->\n+ </param>\n+ <param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF.">\n+ <option value="all">All ORFs/CDSs from each sequence</option>\n+ <option value="top">All ORFs/CDSs from each sequence with the maximum length</option>\n+ <option value="one">First ORF/CDS from each sequence with the maximum length</option>\n+ </param>\n+ <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)" />\n+ <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing.">\n+ <option value="both">Search both the forward and reverse strand</option>\n+ <option value="forward">Only search the forward strand</optio'..b'ype" value="CDS" />\n+ <param name="ends" value="open" />\n+ <param name="mode" value="all" />\n+ <param name="min_len" value="10" />\n+ <param name="strand" value="forward" />\n+ <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" />\n+ <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" />\n+ </test>\n+ <test>\n+ <param name="input_file" value="Ssuis.fasta" />\n+ <param name="table" value="11" />\n+ <param name="ftype" value="ORF" />\n+ <param name="ends" value="open" />\n+ <param name="mode" value="all" />\n+ <param name="min_len" value="100" />\n+ <param name="strand" value="both" />\n+ <output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" />\n+ <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" />\n+ </test>\n+ </tests>\n+ <help>\n+**What it does**\n+\n+Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ\n+and Standard Flowgram Format (SFF) are supported), and searches each sequence\n+for open reading frames (ORFs) or potential coding sequences (CDSs) of the\n+given minimum length. These are returned as FASTA files of nucleotides and\n+protein sequences.\n+\n+You can choose to have all the ORFs/CDSs above the minimum length for each\n+sequence (similar to the EMBOSS getorf tool), those with the longest length\n+equal, or the first ORF/CDS with the longest length (in the special case\n+where a sequence encodes two or more long ORFs/CDSs of the same length). The\n+last option is a reasonable choice when the input sequences represent EST or\n+mRNA sequences, where only one ORF/CDS is expected.\n+\n+Note that if no ORFs/CDSs in a sequence match the criteria, there will be no\n+output for that sequence.\n+\n+Also note that the ORFs/CDSs are assigned modified identifiers to distinguish\n+them from the original full length sequences, by appending a suffix.\n+\n+The start and stop codons are taken from the `NCBI Genetic Codes\n+<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>`_.\n+When searching for ORFs, the sequences will run from stop codon to stop\n+codon, and any start codons are ignored. When searching for CDSs, the first\n+potential start codon will be used, giving the longest possible CDS within\n+each ORF, and thus the longest possible protein sequence. This is useful\n+for things like BLAST or domain searching, but since this may not be the\n+correct start codon may not be appropriate for signal peptide detection\n+etc.\n+\n+**Example Usage**\n+\n+Given some EST sequences (Sanger capillary reads) assembled into unigenes,\n+or a transcriptome assembly from some RNA-Seq, each of your nucleotide\n+sequences should (barring sequencing, assembly errors, frame-shifts etc)\n+encode one protein as a single ORF/CDS, which you wish to extract (and\n+perhaps translate into amino acids).\n+\n+If your RNS-Seq data was strand specific, and assembled taking this into\n+account, you should only search for ORFs/CDSs on the forward strand.\n+\n+**Citation**\n+\n+If you use this Galaxy tool in work leading to a scientific publication please\n+cite the following paper:\n+\n+Peter J.A. Cock, Bj\xc3\xb6rn A. Gr\xc3\xbcning, Konrad Paszkiewicz and Leighton Pritchard (2013).\n+Galaxy tools and workflows for sequence analysis with applications\n+in molecular plant pathology. PeerJ 1:e167\n+http://dx.doi.org/10.7717/peerj.167\n+\n+This tool uses Biopython, so you may also wish to cite the Biopython\n+application note (and Galaxy too of course):\n+\n+Cock et al (2009). Biopython: freely available Python tools for computational\n+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.\n+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.\n+\n+This tool is available to install into other Galaxy Instances via the Galaxy\n+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss\n+ </help>\n+</tool>\n' |
b |
diff -r d51819d2d7e2 -r 5208c15805ec tools/get_orfs_or_cdss/repository_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/get_orfs_or_cdss/repository_dependencies.xml Mon Oct 28 05:19:38 2013 -0400 |
b |
@@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<repositories description="This requires Biopython as a dependency."> +<!-- Leave out the tool shed and revision to get the current + tool shed and latest revision at the time of upload --> +<repository changeset_revision="3e82cbc44886" name="package_biopython_1_62" owner="biopython" toolshed="http://toolshed.g2.bx.psu.edu" /> +</repositories> |