Previous changeset 7:705a2e2df7fb (2015-07-30) Next changeset 9:a06ad07431ba (2017-05-10) |
Commit message:
v0.2.0 with GFF3 output |
modified:
tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/tool_dependencies.xml |
b |
diff -r 705a2e2df7fb -r 09a8be9247ca tools/get_orfs_or_cdss/README.rst --- a/tools/get_orfs_or_cdss/README.rst Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/README.rst Sat Jan 09 23:42:32 2016 -0500 |
b |
@@ -3,6 +3,7 @@ This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +Additions copyright 2015-2016 by Eric Rasche. See the licence text below (MIT licence). This tool is a short Python script (using Biopython library functions) @@ -75,6 +76,7 @@ - Using ``optparse`` for the Python command line API (Eric Rasche). - Added NCBI genetic code table 24, Pterobranchia Mitochondrial. v0.1.1 - Reorder XML elements (internal change only). +v0.2.0 - Tool now also outputs GFF3 formatted calls (Eric Rasche). ======= ====================================================================== @@ -91,12 +93,12 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_update --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ + $ planemo shed_update -t testtoolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ ... or:: - $ planemo shed_update --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ + $ planemo shed_update -t toolshed --check_diff ~/repositories/pico_galaxy/tools/get_orfs_or_cdss/ ... To just build and check the tar ball, use:: |
b |
diff -r 705a2e2df7fb -r 09a8be9247ca tools/get_orfs_or_cdss/get_orfs_or_cdss.py --- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py Sat Jan 09 23:42:32 2016 -0500 |
[ |
b'@@ -16,20 +16,14 @@\n (formerly SCRI), Dundee, UK. All rights reserved.\n \n See accompanying text file for licence details (MIT licence).\n-\n-This is version 0.1.0 of the script.\n """\n import sys\n import re\n from optparse import OptionParser\n \n-def sys_exit(msg, err=1):\n- sys.stderr.write(msg.rstrip() + "\\n")\n- sys.exit(err)\n-\n usage = """Use as follows:\n \n-$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed\n+$ python get_orfs_or_cdss.py -i genome.fa -f fasta --table 11 -t CDS -e open -m all -s both --on cds.nuc.fa --op cds.protein.fa --ob cds.bed --og cds.gff3\n """\n \n try:\n@@ -38,7 +32,7 @@\n from Bio import SeqIO\n from Bio.Data import CodonTable\n except ImportError:\n- sys_exit("Missing Biopython library")\n+ sys.exit("Missing Biopython library")\n \n \n parser = OptionParser(usage=usage)\n@@ -73,6 +67,9 @@\n parser.add_option(\'--ob\', dest=\'out_bed_file\',\n default=None, help=\'Output BED file, or - for STDOUT\',\n metavar=\'FILE\')\n+parser.add_option(\'--og\', dest=\'out_gff3_file\',\n+ default=None, help=\'Output GFF3 file, or - for STDOUT\',\n+ metavar=\'FILE\')\n parser.add_option(\'-v\', \'--version\', dest=\'version\',\n default=False, action=\'store_true\',\n help=\'Show version and quit\')\n@@ -80,26 +77,32 @@\n options, args = parser.parse_args()\n \n if options.version:\n- print "v0.1.0"\n+ print("v0.2.0")\n sys.exit(0)\n \n+if not options.input_file:\n+ sys.exit("Input file is required")\n+\n+if not any((options.out_nuc_file, options.out_prot_file, options.out_bed_file, options.out_gff3_file)):\n+ sys.exit("At least one output file is required")\n+\n try:\n table_obj = CodonTable.ambiguous_generic_by_id[options.table]\n except KeyError:\n- sys_exit("Unknown codon table %i" % options.table)\n+ sys.exit("Unknown codon table %i" % options.table)\n \n-if options.seq_format.lower()=="sff":\n+if options.seq_format.lower() == "sff":\n seq_format = "sff-trim"\n-elif options.seq_format.lower()=="fasta":\n+elif options.seq_format.lower() == "fasta":\n seq_format = "fasta"\n elif options.seq_format.lower().startswith("fastq"):\n seq_format = "fastq"\n else:\n- sys_exit("Unsupported file type %r" % options.seq_format)\n+ sys.exit("Unsupported file type %r" % options.seq_format)\n \n print "Genetic code table %i" % options.table\n print "Minimum length %i aa" % options.min_len\n-#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)\n+# print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)\n \n starts = sorted(table_obj.start_codons)\n assert "NNN" not in starts\n@@ -109,13 +112,14 @@\n assert "NNN" not in stops\n re_stops = re.compile("|".join(stops))\n \n+\n def start_chop_and_trans(s, strict=True):\n """Returns offset, trimmed nuc, protein."""\n if strict:\n assert s[-3:] in stops, s\n assert len(s) % 3 == 0\n for match in re_starts.finditer(s):\n- #Must check the start is in frame\n+ # Must check the start is in frame\n start = match.start()\n if start % 3 == 0:\n n = s[start:]\n@@ -123,11 +127,12 @@\n if strict:\n t = translate(n, options.table, cds=True)\n else:\n- #Use when missing stop codon,\n+ # Use when missing stop codon,\n t = "M" + translate(n[3:], options.table, to_stop=True)\n return start, n, t\n return None, None, None\n \n+\n def break_up_frame(s):\n """Returns offset, nuc, protein."""\n start = 0\n@@ -136,7 +141,7 @@\n if index % 3 != 0:\n continue\n n = s[start:index]\n- if options.ftype=="CDS":\n+ if options.ftype == "CDS":\n offset, n, t = start_chop_and_trans(n)\n else:\n offset = 0\n@@ -145,16 +150,16 @@\n yield start + offset, n, t\n start = index\n if options.ends == "open":\n- '..b't #zero based\n+ start = frame + offset # zero based\n answer.append((start, start + len(n), +1, n, t))\n if options.strand != "forward":\n rc = reverse_complement(nuc_seq)\n- for frame in range(0,3) :\n+ for frame in range(0, 3):\n for offset, n, t in break_up_frame(rc[frame:]):\n- start = full_len - frame - offset #zero based\n- answer.append((start - len(n), start, -1, n ,t))\n+ start = full_len - frame - offset # zero based\n+ answer.append((start - len(n), start, -1, n, t))\n answer.sort()\n return answer\n \n+\n def get_top_peptides(nuc_seq):\n """Returns all peptides of max length."""\n values = list(get_all_peptides(nuc_seq))\n@@ -196,6 +202,7 @@\n if len(x[-1]) == max_len:\n yield x\n \n+\n def get_one_peptide(nuc_seq):\n """Returns first (left most) peptide with max length."""\n values = list(get_top_peptides(nuc_seq))\n@@ -214,40 +221,63 @@\n out_count = 0\n if options.out_nuc_file == "-":\n out_nuc = sys.stdout\n+elif options.out_nuc_file:\n+ out_nuc = open(options.out_nuc_file, "w")\n else:\n- out_nuc = open(options.out_nuc_file, "w")\n+ out_nuc = None\n \n if options.out_prot_file == "-":\n out_prot = sys.stdout\n+elif options.out_prot_file:\n+ out_prot = open(options.out_prot_file, "w")\n else:\n- out_prot = open(options.out_prot_file, "w")\n+ out_prot = None\n \n if options.out_bed_file == "-":\n out_bed = sys.stdout\n+elif options.out_bed_file:\n+ out_bed = open(options.out_bed_file, "w")\n else:\n- out_bed = open(options.out_bed_file, "w")\n+ out_bed = None\n+\n+if options.out_gff3_file == "-":\n+ out_gff3 = sys.stdout\n+elif options.out_gff3_file:\n+ out_gff3 = open(options.out_gff3_file, "w")\n+else:\n+ out_gff3 = None\n+\n+if out_gff3:\n+ out_gff3.write(\'##gff-version 3\\n\')\n \n for record in SeqIO.parse(options.input_file, seq_format):\n for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):\n out_count += 1\n if f_strand == +1:\n- loc = "%i..%i" % (f_start+1, f_end)\n+ loc = "%i..%i" % (f_start + 1, f_end)\n else:\n- loc = "complement(%i..%i)" % (f_start+1, f_end)\n+ loc = "complement(%i..%i)" % (f_start + 1, f_end)\n descr = "length %i aa, %i bp, from %s of %s" \\\n % (len(t), len(n), loc, record.description)\n- fid = record.id + "|%s%i" % (options.ftype, i+1)\n- r = SeqRecord(Seq(n), id = fid, name = "", description= descr)\n- t = SeqRecord(Seq(t), id = fid, name = "", description= descr)\n- SeqIO.write(r, out_nuc, "fasta")\n- SeqIO.write(t, out_prot, "fasta")\n- out_bed.write(\'\\t\'.join(map(str,[record.id, f_start, f_end, fid, 0, \'+\' if f_strand == +1 else \'-\'])) + \'\\n\')\n+ fid = record.id + "|%s%i" % (options.ftype, i + 1)\n+ r = SeqRecord(Seq(n), id=fid, name="", description=descr)\n+ t = SeqRecord(Seq(t), id=fid, name="", description=descr)\n+ if out_nuc:\n+ SeqIO.write(r, out_nuc, "fasta")\n+ if out_prot:\n+ SeqIO.write(t, out_prot, "fasta")\n+ nice_strand = \'+\' if f_strand == +1 else \'-\'\n+ if out_bed:\n+ out_bed.write(\'\\t\'.join(map(str, [record.id, f_start, f_end, fid, 0, nice_strand])) + \'\\n\')\n+ if out_gff3:\n+ out_gff3.write(\'\\t\'.join(map(str, [record.id, \'getOrfsOrCds\', \'CDS\', f_start + 1, f_end, \'.\',\n+ nice_strand, 0, \'ID=%s%s\' % (options.ftype, i + 1)])) + \'\\n\')\n in_count += 1\n-if out_nuc is not sys.stdout:\n+if out_nuc and out_nuc is not sys.stdout:\n out_nuc.close()\n-if out_prot is not sys.stdout:\n+if out_prot and out_prot is not sys.stdout:\n out_prot.close()\n-if out_bed is not sys.stdout:\n+if out_bed and out_bed is not sys.stdout:\n out_bed.close()\n \n print "Found %i %ss in %i sequences" % (out_count, options.ftype, in_count)\n' |
b |
diff -r 705a2e2df7fb -r 09a8be9247ca tools/get_orfs_or_cdss/get_orfs_or_cdss.xml --- a/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml Sat Jan 09 23:42:32 2016 -0500 |
b |
@@ -1,4 +1,4 @@ -<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.1.1"> +<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.2.0"> <description>e.g. to get peptides from ESTs</description> <requirements> <requirement type="package" version="1.65">biopython</requirement> @@ -11,7 +11,7 @@ </stdio> <version_command interpreter="python">get_orfs_or_cdss.py --version</version_command> <command interpreter="python"> -get_orfs_or_cdss.py -i $input_file -f $input_file.ext --table $table -t $ftype -e $ends -m $mode --min_len $min_len -s $strand --on $out_nuc_file --op $out_prot_file --ob $out_bed_file +get_orfs_or_cdss.py -i $input_file -f $input_file.ext --table $table -t $ftype -e $ends -m $mode --min_len $min_len -s $strand --on $out_nuc_file --op $out_prot_file --ob $out_bed_file --og $out_gff3_file </command> <inputs> <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." /> @@ -60,6 +60,7 @@ <data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" /> <data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" /> <data name="out_bed_file" format="bed6" label="${ftype.value}s (bed)" /> + <data name="out_gff3_file" format="gff3" label="${ftype.value}s (gff3)" /> </outputs> <tests> <test> @@ -73,6 +74,7 @@ <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" /> <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" /> <output name="out_bed_file" file="get_orf_input.t1_bed_out.bed" /> + <output name="out_gff3_file" file="get_orf_input.t1_gff3_out.gff3" /> </test> <test> <param name="input_file" value="get_orf_input.fasta" /> @@ -85,6 +87,7 @@ <output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" /> <output name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" /> <output name="out_bed_file" file="get_orf_input.t11_bed_out.bed" /> + <output name="out_gff3_file" file="get_orf_input.t11_gff3_out.gff3" /> </test> <test> <param name="input_file" value="get_orf_input.fasta" /> @@ -97,6 +100,7 @@ <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" /> <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" /> <output name="out_bed_file" file="get_orf_input.t11_open_bed_out.bed" /> + <output name="out_gff3_file" file="get_orf_input.t11_open_gff3_out.gff3" /> </test> <test> <param name="input_file" value="Ssuis.fasta" /> @@ -109,6 +113,7 @@ <output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" /> <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" /> <output name="out_bed_file" file="get_orf_input.Suis_ORF.bed" /> + <output name="out_gff3_file" file="get_orf_input.Suis_ORF.gff3" /> </test> </tests> <help> |
b |
diff -r 705a2e2df7fb -r 09a8be9247ca tools/get_orfs_or_cdss/tool_dependencies.xml --- a/tools/get_orfs_or_cdss/tool_dependencies.xml Thu Jul 30 12:35:31 2015 -0400 +++ b/tools/get_orfs_or_cdss/tool_dependencies.xml Sat Jan 09 23:42:32 2016 -0500 |
b |
@@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> <package name="biopython" version="1.65"> - <repository changeset_revision="dc595937617c" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="030f1a505d40" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency> |