Mercurial > repos > bgruening > trna_prediction

--- a/aragorn.xml	Sat Jul 06 10:37:13 2013 -0400
+++ b/aragorn.xml	Thu Jan 22 13:15:51 2015 -0500
@@ -1,11 +1,13 @@
-<tool id="aragorn_trna" name="tRNA and tmRNA" version="0.3">
-    <description>prediction (Aragon)</description>
+<tool id="aragorn_trna" name="tRNA and tmRNA" version="0.4">
+    <description>prediction (Aragorn)</description>
     <requirements>
         <requirement type="package" version="1.2.36">aragorn</requirement>
+        <requirement type="set_environment">TRNAPRED_SCRIPT_PATH</requirement>
     </requirements>
     <command>
-        aragorn
-            $input
+<![CDATA[
+        aragorn
+            $input
             -gc$genbank_gencode
             $tmRNA
             $tRNA
@@ -14,7 +16,21 @@
             $topology
             -o $output
             $secondary_structure
-            $introns
+            $introns;
+
+#if $gff3_output:
+        aragorn
+            $input
+            -gc$genbank_gencode
+            $tmRNA
+            $tRNA
+            $mtRNA
+            $mam_mtRNA
+            $topology
+            -fasta
+            $introns | python \$TRNAPRED_SCRIPT_PATH/aragorn_out_to_gff3.py > $gff3_output_file;
+#end if
+]]>
     </command>
     <inputs>
         <param name="input" type="data" format="fasta" label="Genome Sequence"/>
@@ -48,6 +64,7 @@
         <param name='mam_mtRNA' type='boolean' label='Search for Mammalian mitochondrial tRNA genes (-mtmam)' truevalue='-mtmam' falsevalue='' checked="false" help='-i switch ignored. -tv switch set. Mammalian mitochondrial genetic code used.' />
         <param name='introns' type='boolean' label='Search for tRNA genes with introns in anticodon loop ...' truevalue='-i' falsevalue='' checked="false" help='...with maximum length 3000 bases (-i).' />
         <param name='secondary_structure' type='boolean' label='Print out secondary structure (-fasta,-fon)' truevalue='-fasta' falsevalue='-fon' checked="false" help='' />
+        <param name="gff3_output" type='boolean' label='Convert output to GFF3' truevalue='True' falsevalue='' checked="false" help='' />
     </inputs>
     <outputs>
         <data name="output" format="fasta">
@@ -55,6 +72,9 @@
                <when input="secondary_structure" value="true" format="text"/>
              </change_format>
         </data>
+        <data format="gff3" name="gff3_output_file" >
+            <filter>gff3_output</filter>
+        </data>
     </outputs>
     <tests>
         <test>
@@ -65,12 +85,15 @@
             <param name="tRNA" value="-t" />
             <param name="mtRNA" value="" />
             <param name="mam_mtRNA" value="" />
-            <param name="introns" value="" />
+            <param name="introns" value="" />
             <param name="secondary_structure" value="-fon" />
+            <param name="gff3_output" value="True" />
             <output name="output" file="aragorn_tansl-table-1_tmRNA_tRNA.fasta" ftype="fasta" />
+            <output name="output" file="aragorn_tansl-table-1_tmRNA_tRNA.gff3" ftype="gff3" />
         </test>
     </tests>
     <help>
+<![CDATA[

 **What it does**

@@ -125,41 +148,41 @@


     tRNA Anticodon Frequency
-    AAA Phe       GAA Phe  1    CAA Leu  1    TAA Leu  1
-    AGA Ser       GGA Ser  1    CGA Ser  2    TGA Ser  1
-    ACA Cys       GCA Cys  2    CCA Trp  2    TCA seC
-    ATA Tyr       GTA Tyr  1    CTA Pyl       TTA Stop
-    AAG Leu       GAG Leu  3    CAG Leu  1    TAG Leu  2
-    AGG Pro       GGG Pro  2    CGG Pro  2    TGG Pro  2
-    ACG Arg  1    GCG Arg  2    CCG Arg  1    TCG Arg
-    ATG His       GTG His  2    CTG Gln  2    TTG Gln  1
-    AAC Val       GAC Val  3    CAC Val  2    TAC Val  1
-    AGC Ala       GGC Ala  2    CGC Ala  3    TGC Ala  1
-    ACC Gly       GCC Gly  5    CCC Gly  1    TCC Gly  2
-    ATC Asp       GTC Asp  3    CTC Glu  2    TTC Glu  2
-    AAT Ile       GAT Ile  3    CAT Met  6    TAT Ile
-    AGT Thr       GGT Thr  2    CGT Thr  1    TGT Thr  2
-    ACT Ser       GCT Ser  1    CCT Arg  1    TCT Arg  1
-    ATT Asn       GTT Asn  3    CTT Lys  3    TTT Lys  2
+    AAA Phe       GAA Phe  1    CAA Leu  1    TAA Leu  1
+    AGA Ser       GGA Ser  1    CGA Ser  2    TGA Ser  1
+    ACA Cys       GCA Cys  2    CCA Trp  2    TCA seC
+    ATA Tyr       GTA Tyr  1    CTA Pyl       TTA Stop
+    AAG Leu       GAG Leu  3    CAG Leu  1    TAG Leu  2
+    AGG Pro       GGG Pro  2    CGG Pro  2    TGG Pro  2
+    ACG Arg  1    GCG Arg  2    CCG Arg  1    TCG Arg
+    ATG His       GTG His  2    CTG Gln  2    TTG Gln  1
+    AAC Val       GAC Val  3    CAC Val  2    TAC Val  1
+    AGC Ala       GGC Ala  2    CGC Ala  3    TGC Ala  1
+    ACC Gly       GCC Gly  5    CCC Gly  1    TCC Gly  2
+    ATC Asp       GTC Asp  3    CTC Glu  2    TTC Glu  2
+    AAT Ile       GAT Ile  3    CAT Met  6    TAT Ile
+    AGT Thr       GGT Thr  2    CGT Thr  1    TGT Thr  2
+    ACT Ser       GCT Ser  1    CCT Arg  1    TCT Arg  1
+    ATT Asn       GTT Asn  3    CTT Lys  3    TTT Lys  2
     Ambiguous: 1

     tRNA Codon Frequency
-    TTT Phe       TTC Phe  1    TTG Leu  1    TTA Leu  1
-    TCT Ser       TCC Ser  1    TCG Ser  2    TCA Ser  1
-    TGT Cys       TGC Cys  2    TGG Trp  2    TGA seC
-    TAT Tyr       TAC Tyr  1    TAG Pyl       TAA Stop
-    CTT Leu       CTC Leu  3    CTG Leu  1    CTA Leu  2
-    CCT Pro       CCC Pro  2    CCG Pro  2    CCA Pro  2
-    CGT Arg  1    CGC Arg  2    CGG Arg  1    CGA Arg
-    CAT His       CAC His  2    CAG Gln  2    CAA Gln  1
-    GTT Val       GTC Val  3    GTG Val  2    GTA Val  1
-    GCT Ala       GCC Ala  2    GCG Ala  3    GCA Ala  1
-    GGT Gly       GGC Gly  5    GGG Gly  1    GGA Gly  2
-    GAT Asp       GAC Asp  3    GAG Glu  2    GAA Glu  2
-    ATT Ile       ATC Ile  3    ATG Met  6    ATA Ile
-    ACT Thr       ACC Thr  2    ACG Thr  1    ACA Thr  2
-    AGT Ser       AGC Ser  1    AGG Arg  1    AGA Arg  1
-    AAT Asn       AAC Asn  3    AAG Lys  3    AAA Lys  2
+    TTT Phe       TTC Phe  1    TTG Leu  1    TTA Leu  1
+    TCT Ser       TCC Ser  1    TCG Ser  2    TCA Ser  1
+    TGT Cys       TGC Cys  2    TGG Trp  2    TGA seC
+    TAT Tyr       TAC Tyr  1    TAG Pyl       TAA Stop
+    CTT Leu       CTC Leu  3    CTG Leu  1    CTA Leu  2
+    CCT Pro       CCC Pro  2    CCG Pro  2    CCA Pro  2
+    CGT Arg  1    CGC Arg  2    CGG Arg  1    CGA Arg
+    CAT His       CAC His  2    CAG Gln  2    CAA Gln  1
+    GTT Val       GTC Val  3    GTG Val  2    GTA Val  1
+    GCT Ala       GCC Ala  2    GCG Ala  3    GCA Ala  1
+    GGT Gly       GGC Gly  5    GGG Gly  1    GGA Gly  2
+    GAT Asp       GAC Asp  3    GAG Glu  2    GAA Glu  2
+    ATT Ile       ATC Ile  3    ATG Met  6    ATA Ile
+    ACT Thr       ACC Thr  2    ACG Thr  1    ACA Thr  2
+    AGT Ser       AGC Ser  1    AGG Arg  1    AGA Arg  1
+    AAT Asn       AAC Asn  3    AAG Lys  3    AAA Lys  2
     Ambiguous: 1

     Number of tRNA genes = 86
@@ -174,7 +197,8 @@

 ARAGORN, a program to detect tRNA genes and tmRNA genes in nucleotide sequences Nucl. Acids Res. (2004) 32(1): 11-16

-doi:10.1093/nar/gkh152
+doi:10.1093/nar/gkh152

+]]>
     </help>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/aragorn_out_to_gff3.py	Thu Jan 22 13:15:51 2015 -0500
@@ -0,0 +1,165 @@
+#!/usr/bin/env python
+import re
+
+def start_pattern(string):
+    return re.match(r'^[0-9]+\.$', string) \
+        or string.startswith('Number of possible') \
+        or string.startswith('Searching for')
+
+def blank_line(string):
+    return re.match(r'^\s*$', string)
+
+def blocks(iterable):
+    accumulator = []
+    run_of_blanklines = 0
+    for line in iterable:
+        # Count blank lines
+        if blank_line(line):
+            run_of_blanklines += 1
+        else:
+            run_of_blanklines = 0
+
+        if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
+            if accumulator:
+                yield accumulator
+                accumulator = [line]
+        else:
+            accumulator.append(line)
+    if accumulator:
+        yield accumulator
+
+IMPORTANT_INFO = {
+    'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
+    'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
+    'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
+    'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
+    'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
+}
+INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')
+
+def important_info(block):
+    info = {}
+    for line in block:
+        for matcher in IMPORTANT_INFO:
+            matches = IMPORTANT_INFO[matcher].search(line)
+            if matches:
+                for group in INFO_GROUPS:
+                    try:
+                        info[group] = matches.group(group)
+                    except:
+                        pass
+    return info
+
+IMPORTANT_INFO_TMRNA = {
+    'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
+    'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
+}
+INFO_GROUPS_TMRNA = ('start', 'end', 'pep')
+
+def important_info_tmrna(block):
+    info = {}
+    for line in block:
+        for matcher in IMPORTANT_INFO_TMRNA:
+            matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
+            if matches:
+                for group in INFO_GROUPS_TMRNA:
+                    try:
+                        info[group] = matches.group(group)
+                    except:
+                        pass
+    return info
+
+import fileinput
+stdin_data = []
+for line in fileinput.input():
+    stdin_data.append(line)
+
+possible_blocks = [line for line in blocks(stdin_data)]
+
+seqid = None
+print '##gff-version-3'
+# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
+# awful things
+for block_idx in range(len(possible_blocks)):
+    block = possible_blocks[block_idx]
+    data = None
+    fasta_defline = None
+
+    if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
+        # Try and get a sequence ID out of it
+        try:
+            fasta_defline = block[-2].strip()
+        except:
+            # Failing that, ignore it.
+            pass
+    else:
+        # They DUPLICATE results in multiple places, including a fasta version
+        # in the 'full report'.
+        possible_ugliness = [x for x in block if x.startswith('>t')]
+        if len(possible_ugliness) > 0:
+            continue
+
+        # However, if it didn't have one of those all important pieces of
+        # information, then it's either a different important piece of
+        # information, or complete junk
+        data = important_info(block)
+
+        # I am not proud of any of this. We essentially say "if that block
+        # didn't come up with useful info, then try making it a tmrna"
+        if len(data.keys()) == 0:
+            data = important_info_tmrna(block)
+            # And if that fails, just none it.
+            if len(data.keys()) == 0:
+                data = None
+            else:
+                # But if it didn't, confirm that we're a tmRNA
+                data['type'] = 'tmRNA'
+        else:
+            # If we did have keys, and didn't pass through any of the tmRNA
+            # checks, we're tRNA
+            data['type'] = 'tRNA'
+
+    # If we got a sequence ID in this block, set the defline
+    if 'nucleotides in sequence' in block[-1]:
+        try:
+            fasta_defline = block[-2].strip()
+        except:
+            pass
+
+    # if a defline is available, try and extract the fasta header ID
+    if fasta_defline is not None:
+        try:
+            seqid = fasta_defline[0:fasta_defline.index(' ')]
+        except:
+            seqid = fasta_defline
+
+    # If there's data
+    if data is not None and len(data.keys()) > 1:
+
+        # Deal with our flags/notes.
+        if data['type'] == 'tRNA':
+            # Are these acceptable GFF3 tags?
+            notes = {
+                'Codon': data['codon'],
+                'Anticodon': data['anticodon'],
+            }
+            if 'pseudo' in data:
+                notes['Note'] = 'Possible pseudogene'
+        else:
+            notes = {
+                'Note': 'Tag peptide: ' + data['pep'] + ''
+            }
+
+        notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])
+
+        print '\t'.join([
+            seqid,
+            'aragorn',
+            data['type'],
+            data['start'],
+            data['end'],
+            '.',
+            '.',
+            '.',
+            notestr
+        ])
--- a/tRNAscan.xml	Sat Jul 06 10:37:13 2013 -0400
+++ b/tRNAscan.xml	Thu Jan 22 13:15:51 2015 -0500
@@ -5,6 +5,7 @@
         <requirement type="package" version="1.61">biopython</requirement>
     </requirements>
     <command interpreter="python">
+<![CDATA[
         tRNAscan.py
             $organism
             $mode
@@ -15,6 +16,7 @@
             $tabular_output
             $inputfile
             $fasta_output
+]]>
     </command>
     <inputs>
         <param name="inputfile" type="data" format="fasta" label="Genome Sequence" help="Dataset missing? See TIP below"/>
@@ -54,6 +56,7 @@
         </test>
     </tests>
     <help>
+<![CDATA[


 .. class:: warningmark
@@ -176,7 +179,7 @@
 	pseudogene. The heuristic pseudogene detection filter uses this
 	information to flag possible pseudogenes -- use this option to see why
 	a hit is marked as a possible pseudogene. The user may wish to
-	examine score breakdowns from known tRNAs in the organism of interest
+	examine score breakdowns from known tRNAs in the organism of interest
 	to get a frame of reference.

 -----
@@ -207,7 +210,7 @@


     ========     ======    =====     ======    ====    ==========    ======    ======    ==========    ==========
-                              tRNA Bounds                              Intron Bonds
+                              tRNA Bounds                              Intron Bonds
     --------     ------    ----------------    ----    ----------    ----------------    ----------    ----------
     Name         # tRNA    Begin     End       tRNA    Anti Codon    Begin     End       Cove Score    Hit Origin
     ========     ======    =====     ======    ====    ==========    ======    ======    ==========    ==========
@@ -227,8 +230,9 @@

 tRNAscan-SE: A Program for Improved Detection of Transfer RNA Genes in Genomic Sequence Nucl. Acids Res. (1997) 25(5): 0955-964

-doi:10.1093/nar/25.5.0955
+doi:10.1093/nar/25.5.0955


+]]>
     </help>
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/aragorn_tansl-table-1_tmRNA_tRNA.gff3	Thu Jan 22 13:15:51 2015 -0500
@@ -0,0 +1,2 @@
+##gff-version-3
+gi|240255695:23036500-23037000	aragorn	tRNA	381	453	.	.	.	Anticodon=tgc;Codon=Ala
--- a/tool_dependencies.xml	Sat Jul 06 10:37:13 2013 -0400
+++ b/tool_dependencies.xml	Thu Jan 22 13:15:51 2015 -0500
@@ -1,47 +1,15 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="biopython" version="1.61">
-        <repository changeset_revision="5d0c54f7fea2" name="package_biopython_1_61" owner="biopython" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
     <package name="aragorn" version="1.2.36">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">http://mbio-serv2.mbioekol.lu.se/ARAGORN/Downloads/aragorn1.2.36.tgz</action>
-                <action type="make_directory">$INSTALL_DIR/bin/</action>
-                <action type="shell_command">gcc -O3 -ffast-math -finline-functions -o aragorn aragorn1.2.36.c</action>
-                <action type="move_file">
-                    <source>aragorn</source>
-                    <destination>$INSTALL_DIR/bin</destination>
-                </action>
-                <action type="set_environment">
-                    <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>Compiling ARAGORN requires gcc.</readme>
+        <repository changeset_revision="f09e2902e6ed" name="package_aragorn_1_2_36" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
     <package name="tRNAscan-SE" version="1.3.1">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">http://lowelab.ucsc.edu/software/tRNAscan-SE.tar.gz</action>
-                <action type="make_directory">$INSTALL_DIR/bin/</action>
-                <action type="make_directory">$INSTALL_DIR/lib/tRNAscan-SE/</action>
-                <action type="make_directory">$INSTALL_DIR/man/</action>
-                <!-- replacing the hardcoded pathvariables with the real ones -->
-                <action type="shell_command">cd ./tRNAscan-SE-1.3.1 &amp;&amp; sed 's%^BINDIR  = .*%BINDIR = $INSTALL_DIR/bin/%' Makefile | sed 's%^LIBDIR  = .*%LIBDIR = $INSTALL_DIR/lib/tRNAscan-SE/%' | sed 's%^MANDIR  = .*%MANDIR = $INSTALL_DIR/man%' &gt; Makefile_new</action>
-                <action type="shell_command">cd ./tRNAscan-SE-1.3.1 &amp;&amp; rm Makefile &amp;&amp; mv Makefile_new Makefile</action>
-                <action type="shell_command">cd ./tRNAscan-SE-1.3.1 &amp;&amp; make &amp;&amp; make install</action>
-
-                <!-- for some reason infernal needs to be directly under the bin/ from tRNAScan -->
-                <action type="shell_command">wget ftp://selab.janelia.org/pub/software/infernal/infernal-1.0.2.tar.gz</action>
-                <action type="shell_command">tar xfvz infernal-1.0.2.tar.gz</action>
-                <action type="shell_command">cd infernal-1.0.2 &amp;&amp; ./configure --prefix=$INSTALL_DIR &amp;&amp; make &amp;&amp; make install</action>
-                <action type="set_environment">
-                    <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
-                    <environment_variable action="prepend_to" name="PERL5LIB">$INSTALL_DIR/bin/</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>Compiling and running tRNAScan-SE requires gcc a PERL environment.</readme>
+        <repository changeset_revision="a3fb2752c340" name="package_trnascan_1_3_1" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" />
     </package>
+    <set_environment version="1.0">
+        <environment_variable action="set_to" name="TRNAPRED_SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable>
+    </set_environment>
 </tool_dependency>