Mercurial > repos > peterjc > get_orfs_or_cdss

--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.py	Wed May 30 08:33:20 2018 -0400
+++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py	Fri Apr 16 22:37:04 2021 +0000
@@ -10,7 +10,7 @@

 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
 (formerly SCRI), Dundee, UK. All rights reserved.
@@ -42,43 +42,100 @@


 parser = OptionParser(usage=usage)
-parser.add_option('-i', '--input', dest='input_file',
-                  default=None, help='Input fasta file',
-                  metavar='FILE')
-parser.add_option('-f', '--format', dest='seq_format',
-                  default='fasta', help='Sequence format (e.g. fasta, fastq, sff)')
-parser.add_option('--table', dest='table',
-                  default=1, help='NCBI Translation table', type='int')
-parser.add_option('-t', '--ftype', dest='ftype', type='choice',
-                  choices=['CDS', 'ORF'], default='ORF',
-                  help='Find ORF or CDSs')
-parser.add_option('-e', '--ends', dest='ends', type='choice',
-                  choices=['open', 'closed'], default='closed',
-                  help='Open or closed. Closed ensures start/stop codons are present')
-parser.add_option('-m', '--mode', dest='mode', type='choice',
-                  choices=['all', 'top', 'one'], default='all',
-                  help='Output all ORFs/CDSs from sequence, all ORFs/CDSs '
-                  'with max length, or first with maximum length')
-parser.add_option('--min_len', dest='min_len',
-                  default=10, help='Minimum ORF/CDS length', type='int')
-parser.add_option('-s', '--strand', dest='strand', type='choice',
-                  choices=['forward', 'reverse', 'both'], default='both',
-                  help='Strand to search for features on')
-parser.add_option('--on', dest='out_nuc_file',
-                  default=None, help='Output nucleotide sequences, or - for STDOUT',
-                  metavar='FILE')
-parser.add_option('--op', dest='out_prot_file',
-                  default=None, help='Output protein sequences, or - for STDOUT',
-                  metavar='FILE')
-parser.add_option('--ob', dest='out_bed_file',
-                  default=None, help='Output BED file, or - for STDOUT',
-                  metavar='FILE')
-parser.add_option('--og', dest='out_gff3_file',
-                  default=None, help='Output GFF3 file, or - for STDOUT',
-                  metavar='FILE')
-parser.add_option('-v', '--version', dest='version',
-                  default=False, action='store_true',
-                  help='Show version and quit')
+parser.add_option(
+    "-i",
+    "--input",
+    dest="input_file",
+    default=None,
+    help="Input fasta file",
+    metavar="FILE",
+)
+parser.add_option(
+    "-f",
+    "--format",
+    dest="seq_format",
+    default="fasta",
+    help="Sequence format (e.g. fasta, fastq, sff)",
+)
+parser.add_option(
+    "--table", dest="table", default=1, help="NCBI Translation table", type="int"
+)
+parser.add_option(
+    "-t",
+    "--ftype",
+    dest="ftype",
+    type="choice",
+    choices=["CDS", "ORF"],
+    default="ORF",
+    help="Find ORF or CDSs",
+)
+parser.add_option(
+    "-e",
+    "--ends",
+    dest="ends",
+    type="choice",
+    choices=["open", "closed"],
+    default="closed",
+    help="Open or closed. Closed ensures start/stop codons are present",
+)
+parser.add_option(
+    "-m",
+    "--mode",
+    dest="mode",
+    type="choice",
+    choices=["all", "top", "one"],
+    default="all",
+    help="Output all ORFs/CDSs from sequence, all ORFs/CDSs "
+    "with max length, or first with maximum length",
+)
+parser.add_option(
+    "--min_len", dest="min_len", default=10, help="Minimum ORF/CDS length", type="int"
+)
+parser.add_option(
+    "-s",
+    "--strand",
+    dest="strand",
+    type="choice",
+    choices=["forward", "reverse", "both"],
+    default="both",
+    help="Strand to search for features on",
+)
+parser.add_option(
+    "--on",
+    dest="out_nuc_file",
+    default=None,
+    help="Output nucleotide sequences, or - for STDOUT",
+    metavar="FILE",
+)
+parser.add_option(
+    "--op",
+    dest="out_prot_file",
+    default=None,
+    help="Output protein sequences, or - for STDOUT",
+    metavar="FILE",
+)
+parser.add_option(
+    "--ob",
+    dest="out_bed_file",
+    default=None,
+    help="Output BED file, or - for STDOUT",
+    metavar="FILE",
+)
+parser.add_option(
+    "--og",
+    dest="out_gff3_file",
+    default=None,
+    help="Output GFF3 file, or - for STDOUT",
+    metavar="FILE",
+)
+parser.add_option(
+    "-v",
+    "--version",
+    dest="version",
+    default=False,
+    action="store_true",
+    help="Show version and quit",
+)

 options, args = parser.parse_args()

@@ -89,7 +146,14 @@
 if not options.input_file:
     sys.exit("Input file is required")

-if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)):
+if not any(
+    (
+        options.out_nuc_file,
+        options.out_prot_file,
+        options.out_bed_file,
+        options.out_gff3_file,
+    )
+):
     sys.exit("At least one output file is required")

 try:
@@ -120,7 +184,7 @@


 def start_chop_and_trans(s, strict=True):
-    """Returns offset, trimmed nuc, protein."""
+    """Return offset, trimmed nuc, protein."""
     if strict:
         assert s[-3:] in stops, s
     assert len(s) % 3 == 0
@@ -140,7 +204,7 @@


 def break_up_frame(s):
-    """Returns offset, nuc, protein."""
+    """Return offset, nuc, protein."""
     start = 0
     for match in re_stops.finditer(s):
         index = match.start() + 3
@@ -175,7 +239,7 @@


 def get_all_peptides(nuc_seq):
-    """Returns start, end, strand, nucleotides, protein.
+    """Return start, end, strand, nucleotides, protein.

     Co-ordinates are Python style zero-based.
     """
@@ -199,7 +263,7 @@


 def get_top_peptides(nuc_seq):
-    """Returns all peptides of max length."""
+    """Return all peptides of max length."""
     values = list(get_all_peptides(nuc_seq))
     if not values:
         raise StopIteration
@@ -210,7 +274,7 @@


 def get_one_peptide(nuc_seq):
-    """Returns first (left most) peptide with max length."""
+    """Return first (left most) peptide with max length."""
     values = list(get_top_peptides(nuc_seq))
     if not values:
         raise StopIteration
@@ -255,17 +319,23 @@
     out_gff3 = None

 if out_gff3:
-    out_gff3.write('##gff-version 3\n')
+    out_gff3.write("##gff-version 3\n")

 for record in SeqIO.parse(options.input_file, seq_format):
-    for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
+    for i, (f_start, f_end, f_strand, n, t) in enumerate(
+        get_peptides(str(record.seq).upper())
+    ):
         out_count += 1
         if f_strand == +1:
             loc = "%i..%i" % (f_start + 1, f_end)
         else:
             loc = "complement(%i..%i)" % (f_start + 1, f_end)
-        descr = "length %i aa, %i bp, from %s of %s" \
-                % (len(t), len(n), loc, record.description)
+        descr = "length %i aa, %i bp, from %s of %s" % (
+            len(t),
+            len(n),
+            loc,
+            record.description,
+        )
         fid = record.id + "|%s%i" % (options.ftype, i + 1)
         r = SeqRecord(Seq(n), id=fid, name="", description=descr)
         t = SeqRecord(Seq(t), id=fid, name="", description=descr)
@@ -273,12 +343,32 @@
             SeqIO.write(r, out_nuc, "fasta")
         if out_prot:
             SeqIO.write(t, out_prot, "fasta")
-        nice_strand = '+' if f_strand == +1 else '-'
+        nice_strand = "+" if f_strand == +1 else "-"
         if out_bed:
-            out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n')
+            out_bed.write(
+                "\t".join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand]))
+                + "\n"
+            )
         if out_gff3:
-            out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.',
-                                               nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n')
+            out_gff3.write(
+                "\t".join(
+                    map(
+                        str,
+                        [
+                            record.id,
+                            "getOrfsOrCds",
+                            "CDS",
+                            f_start + 1,
+                            f_end,
+                            ".",
+                            nice_strand,
+                            0,
+                            "ID=%s%s" % (options.ftype, i + 1),
+                        ],
+                    )
+                )
+                + "\n"
+            )
     in_count += 1
 if out_nuc and out_nuc is not sys.stdout:
     out_nuc.close()
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml	Wed May 30 08:33:20 2018 -0400
+++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml	Fri Apr 16 22:37:04 2021 +0000
@@ -184,14 +184,14 @@
 Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
 Galaxy tools and workflows for sequence analysis with applications
 in molecular plant pathology. PeerJ 1:e167
-http://dx.doi.org/10.7717/peerj.167
+https://doi.org/10.7717/peerj.167

 This tool uses Biopython, so you may also wish to cite the Biopython
 application note (and Galaxy too of course):

 Cock et al (2009). Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss
--- a/tools/get_orfs_or_cdss/tool_dependencies.xml	Wed May 30 08:33:20 2018 -0400
+++ b/tools/get_orfs_or_cdss/tool_dependencies.xml	Fri Apr 16 22:37:04 2021 +0000
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
     <package name="biopython" version="1.67">
-        <repository changeset_revision="a12f73c3b116" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file