changeset 8:09a8be9247ca draft

v0.2.0 with GFF3 output
author peterjc
date Sat, 09 Jan 2016 23:42:32 -0500
parents 705a2e2df7fb
children a06ad07431ba
files tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/tool_dependencies.xml
diffstat 4 files changed, 85 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/tools/get_orfs_or_cdss/README.rst	Thu Jul 30 12:35:31 2015 -0400
+++ b/tools/get_orfs_or_cdss/README.rst	Sat Jan 09 23:42:32 2016 -0500
@@ -3,6 +3,7 @@
 
 This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+Additions copyright 2015-2016 by Eric Rasche.
 See the licence text below (MIT licence).
 
 This tool is a short Python script (using Biopython library functions)
@@ -75,6 +76,7 @@
         - Using ``optparse`` for the Python command line API (Eric Rasche).
         - Added NCBI genetic code table 24, Pterobranchia Mitochondrial.
 v0.1.1  - Reorder XML elements (internal change only).
+v0.2.0  - Tool now also outputs GFF3 formatted calls (Eric Rasche).
 ======= ======================================================================
 
 
@@ -91,12 +93,12 @@
 Planemo commands (which requires you have set your Tool Shed access details in
 ``~/.planemo.yml`` and that you have access rights on the Tool Shed)::
 
-    $ planemo shed_update --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/
+    $ planemo shed_update -t testtoolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/
     ...
 
 or::
 
-    $ planemo shed_update --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/
+    $ planemo shed_update -t toolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/
     ...
 
 To just build and check the tar ball, use::
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.py	Thu Jul 30 12:35:31 2015 -0400
+++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py	Sat Jan 09 23:42:32 2016 -0500
@@ -16,20 +16,14 @@
 (formerly SCRI), Dundee, UK. All rights reserved.
 
 See accompanying text file for licence details (MIT licence).
-
-This is version 0.1.0 of the script.
 """
 import sys
 import re
 from optparse import OptionParser
 
-def sys_exit(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
 usage = """Use as follows:
 
-$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed
+$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3
 """
 
 try:
@@ -38,7 +32,7 @@
     from Bio import SeqIO
     from Bio.Data import CodonTable
 except ImportError:
-    sys_exit("Missing Biopython library")
+    sys.exit("Missing Biopython library")
 
 
 parser = OptionParser(usage=usage)
@@ -73,6 +67,9 @@
 parser.add_option('--ob', dest='out_bed_file',
                   default=None, help='Output BED file, or - for STDOUT',
                   metavar='FILE')
+parser.add_option('--og', dest='out_gff3_file',
+                  default=None, help='Output GFF3 file, or - for STDOUT',
+                  metavar='FILE')
 parser.add_option('-v', '--version', dest='version',
                   default=False, action='store_true',
                   help='Show version and quit')
@@ -80,26 +77,32 @@
 options, args = parser.parse_args()
 
 if options.version:
-    print "v0.1.0"
+    print("v0.2.0")
     sys.exit(0)
 
+if not options.input_file:
+    sys.exit("Input file is required")
+
+if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)):
+    sys.exit("At least one output file is required")
+
 try:
     table_obj = CodonTable.ambiguous_generic_by_id[options.table]
 except KeyError:
-    sys_exit("Unknown codon table %i" % options.table)
+    sys.exit("Unknown codon table %i" % options.table)
 
-if options.seq_format.lower()=="sff":
+if options.seq_format.lower() == "sff":
     seq_format = "sff-trim"
-elif options.seq_format.lower()=="fasta":
+elif options.seq_format.lower() == "fasta":
     seq_format = "fasta"
 elif options.seq_format.lower().startswith("fastq"):
     seq_format = "fastq"
 else:
-    sys_exit("Unsupported file type %r" % options.seq_format)
+    sys.exit("Unsupported file type %r" % options.seq_format)
 
 print "Genetic code table %i" % options.table
 print "Minimum length %i aa" % options.min_len
-#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
+# print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
 
 starts = sorted(table_obj.start_codons)
 assert "NNN" not in starts
@@ -109,13 +112,14 @@
 assert "NNN" not in stops
 re_stops = re.compile("|".join(stops))
 
+
 def start_chop_and_trans(s, strict=True):
     """Returns offset, trimmed nuc, protein."""
     if strict:
         assert s[-3:] in stops, s
     assert len(s) % 3 == 0
     for match in re_starts.finditer(s):
-        #Must check the start is in frame
+        # Must check the start is in frame
         start = match.start()
         if start % 3 == 0:
             n = s[start:]
@@ -123,11 +127,12 @@
             if strict:
                 t = translate(n, options.table, cds=True)
             else:
-                #Use when missing stop codon,
+                # Use when missing stop codon,
                 t = "M" + translate(n[3:], options.table, to_stop=True)
             return start, n, t
     return None, None, None
 
+
 def break_up_frame(s):
     """Returns offset, nuc, protein."""
     start = 0
@@ -136,7 +141,7 @@
         if index % 3 != 0:
             continue
         n = s[start:index]
-        if options.ftype=="CDS":
+        if options.ftype == "CDS":
             offset, n, t = start_chop_and_trans(n)
         else:
             offset = 0
@@ -145,16 +150,16 @@
             yield start + offset, n, t
         start = index
     if options.ends == "open":
-        #No stop codon, Biopython's strict CDS translate will fail
+        # No stop codon, Biopython's strict CDS translate will fail
         n = s[start:]
-        #Ensure we have whole codons
-        #TODO - Try appending N instead?
-        #TODO - Do the next four lines more elegantly
+        # Ensure we have whole codons
+        # TODO - Try appending N instead?
+        # TODO - Do the next four lines more elegantly
         if len(n) % 3:
             n = n[:-1]
         if len(n) % 3:
             n = n[:-1]
-        if options.ftype=="CDS":
+        if options.ftype == "CDS":
             offset, n, t = start_chop_and_trans(n, strict=False)
         else:
             offset = 0
@@ -168,24 +173,25 @@
 
     Co-ordinates are Python style zero-based.
     """
-    #TODO - Refactor to use a generator function (in start order)
-    #rather than making a list and sorting?
+    # TODO - Refactor to use a generator function (in start order)
+    # rather than making a list and sorting?
     answer = []
     full_len = len(nuc_seq)
     if options.strand != "reverse":
-        for frame in range(0,3):
+        for frame in range(0, 3):
             for offset, n, t in break_up_frame(nuc_seq[frame:]):
-                start = frame + offset #zero based
+                start = frame + offset  # zero based
                 answer.append((start, start + len(n), +1, n, t))
     if options.strand != "forward":
         rc = reverse_complement(nuc_seq)
-        for frame in range(0,3) :
+        for frame in range(0, 3):
             for offset, n, t in break_up_frame(rc[frame:]):
-                start = full_len - frame - offset #zero based
-                answer.append((start - len(n), start, -1, n ,t))
+                start = full_len - frame - offset  # zero based
+                answer.append((start - len(n), start, -1, n, t))
     answer.sort()
     return answer
 
+
 def get_top_peptides(nuc_seq):
     """Returns all peptides of max length."""
     values = list(get_all_peptides(nuc_seq))
@@ -196,6 +202,7 @@
         if len(x[-1]) == max_len:
             yield x
 
+
 def get_one_peptide(nuc_seq):
     """Returns first (left most) peptide with max length."""
     values = list(get_top_peptides(nuc_seq))
@@ -214,40 +221,63 @@
 out_count = 0
 if options.out_nuc_file == "-":
     out_nuc = sys.stdout
+elif options.out_nuc_file:
+    out_nuc = open(options.out_nuc_file, "w")
 else:
-    out_nuc = open(options.out_nuc_file, "w")
+    out_nuc = None
 
 if options.out_prot_file == "-":
     out_prot = sys.stdout
+elif options.out_prot_file:
+    out_prot = open(options.out_prot_file, "w")
 else:
-    out_prot = open(options.out_prot_file, "w")
+    out_prot = None
 
 if options.out_bed_file == "-":
     out_bed = sys.stdout
+elif options.out_bed_file:
+    out_bed = open(options.out_bed_file, "w")
 else:
-    out_bed = open(options.out_bed_file, "w")
+    out_bed = None
+
+if options.out_gff3_file == "-":
+    out_gff3 = sys.stdout
+elif options.out_gff3_file:
+    out_gff3 = open(options.out_gff3_file, "w")
+else:
+    out_gff3 = None
+
+if out_gff3:
+    out_gff3.write('##gff-version 3\n')
 
 for record in SeqIO.parse(options.input_file, seq_format):
     for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
         out_count += 1
         if f_strand == +1:
-            loc = "%i..%i" % (f_start+1, f_end)
+            loc = "%i..%i" % (f_start + 1, f_end)
         else:
-            loc = "complement(%i..%i)" % (f_start+1, f_end)
+            loc = "complement(%i..%i)" % (f_start + 1, f_end)
         descr = "length %i aa, %i bp, from %s of %s" \
                 % (len(t), len(n), loc, record.description)
-        fid = record.id + "|%s%i" % (options.ftype, i+1)
-        r = SeqRecord(Seq(n), id = fid, name = "", description= descr)
-        t = SeqRecord(Seq(t), id = fid, name = "", description= descr)
-        SeqIO.write(r, out_nuc, "fasta")
-        SeqIO.write(t, out_prot, "fasta")
-        out_bed.write('\t'.join(map(str,[record.id, f_start, f_end, fid, 0, '+' if f_strand == +1 else '-'])) + '\n')
+        fid = record.id + "|%s%i" % (options.ftype, i + 1)
+        r = SeqRecord(Seq(n), id=fid, name="", description=descr)
+        t = SeqRecord(Seq(t), id=fid, name="", description=descr)
+        if out_nuc:
+            SeqIO.write(r, out_nuc, "fasta")
+        if out_prot:
+            SeqIO.write(t, out_prot, "fasta")
+        nice_strand = '+' if f_strand == +1 else '-'
+        if out_bed:
+            out_bed.write('\t'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + '\n')
+        if out_gff3:
+            out_gff3.write('\t'.join(map(str, [record.id, 'getOrfsOrCds', 'CDS', f_start + 1, f_end, '.',
+                                               nice_strand, 0, 'ID=%s%s' % (options.ftype, i + 1)])) + '\n')
     in_count += 1
-if out_nuc is not sys.stdout:
+if out_nuc and out_nuc is not sys.stdout:
     out_nuc.close()
-if out_prot is not sys.stdout:
+if out_prot and out_prot is not sys.stdout:
     out_prot.close()
-if out_bed is not sys.stdout:
+if out_bed and out_bed is not sys.stdout:
     out_bed.close()
 
 print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)
--- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml	Thu Jul 30 12:35:31 2015 -0400
+++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml	Sat Jan 09 23:42:32 2016 -0500
@@ -1,4 +1,4 @@
-<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.1.1">
+<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.2.0">
     <description>e.g. to get peptides from ESTs</description>
     <requirements>
         <requirement type="package" version="1.65">biopython</requirement>
@@ -11,7 +11,7 @@
     </stdio>
     <version_command interpreter="python">get_orfs_or_cdss.py --version</version_command>
     <command interpreter="python">
-get_orfs_or_cdss.py -i $input_file -f $input_file.ext --table $table -t $ftype -e $ends -m $mode --min_len $min_len -s $strand --on $out_nuc_file --op $out_prot_file --ob $out_bed_file
+get_orfs_or_cdss.py -i $input_file -f $input_file.ext --table $table -t $ftype -e $ends -m $mode --min_len $min_len -s $strand --on $out_nuc_file --op $out_prot_file --ob $out_bed_file --og $out_gff3_file
     </command>
     <inputs>
         <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." />
@@ -60,6 +60,7 @@
         <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" />
         <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" />
         <data name="out_bed_file" format="bed6" label="${ftype.value}s (bed)" />
+        <data name="out_gff3_file" format="gff3" label="${ftype.value}s (gff3)" />
     </outputs>
     <tests>
         <test>
@@ -73,6 +74,7 @@
             <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" />
             <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" />
             <output name="out_bed_file" file="get_orf_input.t1_bed_out.bed" />
+            <output name="out_gff3_file" file="get_orf_input.t1_gff3_out.gff3" />
         </test>
         <test>
             <param name="input_file" value="get_orf_input.fasta" />
@@ -85,6 +87,7 @@
             <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" />
             <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" />
             <output name="out_bed_file" file="get_orf_input.t11_bed_out.bed" />
+            <output name="out_gff3_file" file="get_orf_input.t11_gff3_out.gff3" />
         </test>
         <test>
             <param name="input_file" value="get_orf_input.fasta" />
@@ -97,6 +100,7 @@
             <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" />
             <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" />
             <output name="out_bed_file" file="get_orf_input.t11_open_bed_out.bed" />
+            <output name="out_gff3_file" file="get_orf_input.t11_open_gff3_out.gff3" />
         </test>
         <test>
             <param name="input_file" value="Ssuis.fasta" />
@@ -109,6 +113,7 @@
             <output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" />
             <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" />
             <output name="out_bed_file" file="get_orf_input.Suis_ORF.bed" />
+            <output name="out_gff3_file" file="get_orf_input.Suis_ORF.gff3" />
         </test>
     </tests>
     <help>
--- a/tools/get_orfs_or_cdss/tool_dependencies.xml	Thu Jul 30 12:35:31 2015 -0400
+++ b/tools/get_orfs_or_cdss/tool_dependencies.xml	Sat Jan 09 23:42:32 2016 -0500
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="biopython" version="1.65">
-        <repository changeset_revision="dc595937617c" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="030f1a505d40" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>