Mercurial > repos > yhoogstrate > unafold
changeset 0:bf022d3751fa draft
planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/unafold_galaxy_wrapper commit 84b70c01144fa018db45215941fb395798376100-dirty
author | yhoogstrate |
---|---|
date | Tue, 16 Jun 2015 11:21:12 -0400 |
parents | |
children | 37198dc9311c |
files | datatypes_conf.xml lib/galaxy/datatypes/RNAStructure.py test-data/test1_input.fa test-data/test1_output.ct tool_dependencies.xml unafold.xml |
diffstat | 6 files changed, 280 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Tue Jun 16 11:21:12 2015 -0400 @@ -0,0 +1,25 @@ +<?xml version="1.0"?> +<datatypes> + <datatype_files> + <datatype_file name="RNAStructure.py"/> + </datatype_files> + + <registration> + <datatype + extension="ct" + type="galaxy.datatypes.RNAStructure:ConnectivityTable" + display_in_upload="True" + subclass="True" + description="ConnectivityTable format is a text-based column wise format for storing both an RNA sequence and its corresponding 2D structure." /> + <datatype + extension="rnaml" + type="galaxy.datatypes.RNAStructure:RNAML" + display_in_upload="True" + subclass="True" + description="RNAML: a standard syntax for exchanging RNA information." + url="http://www.ncbi.nlm.nih.gov/pubmed/12088144" /> + </registration> + + <sniffers> + </sniffers> +</datatypes>
# lib/galaxy/datatypes/RNAStructure.py
"""
Galaxy datatypes for RNA secondary-structure file formats:
Dot-Bracket (dbn), Connectivity Table (ct) and RNAML (rnaml).
"""
import logging
import re

from galaxy import util
import galaxy
import galaxy.model
import galaxy.datatypes
import galaxy.datatypes.data

from galaxy.datatypes.metadata import MetadataElement

from galaxy.datatypes.sequence import Sequence
from galaxy.datatypes.tabular import Tabular
from galaxy.datatypes.xml import GenericXml

from galaxy.datatypes.data import Data

log = logging.getLogger(__name__)


class DotBracket(Sequence):
    """Dot-Bracket notation: a header, a sequence and a structure line per record."""
    edam_format = "format_1457"
    file_ext = "dbn"

    # Both patterns are anchored at the end and require at least one
    # character; an unanchored ``*`` pattern matches every string and
    # therefore validates nothing.
    sequence_regexp = re.compile(r"^[ACGTURYKMSWBDHVN]+$")
    structure_regexp = re.compile(r"^[\(\)\.]+$")

    def set_meta(self, dataset, **kwd):
        """
        Set the number of sequences and the number of data lines
        in dataset.

        Metadata is reset to ``None`` instead of being computed when the
        dataset is larger than ``max_optional_metadata_filesize``.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            # NOTE(review): 'seconday_structures' looks like a misspelling of
            # 'secondary_structures'; kept unchanged because no reader of
            # this key is visible here - confirm before renaming.
            dataset.metadata.seconday_structures = None
            return

        data_lines = 0
        sequences = 0

        # open() inside a with-block works on Python 2 and 3 and always
        # closes the handle, unlike the Python 2-only file() builtin.
        with open(dataset.file_name) as handle:
            for line in handle:
                data_lines += 1

                # Every header line ('>' prefix) starts a new sequence
                if line.strip().startswith('>'):
                    sequences += 1

        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def sniff(self, filename):
        """
        Return True if the file at `filename` looks like Dot-Bracket.

        The format is as follows, although it remains unclear whether
        the Dot-Bracket format may contain multiple sequences per file:

        >sequenceName1
        CCCaaaGGG
        (((...)))
        >sequenceName2
        GGGuuuCCC
        (((...)))

        Blank lines are ignored. The file must contain at least one
        complete record and must not end in the middle of a record.
        """
        state = 0  # 0: header line, 1: sequence line, 2: structure line
        records = 0
        sequence_size = 0

        with open(filename) as handle:
            for line in handle:
                line = line.strip()

                if not line:
                    # Skip blank lines instead of indexing into an empty string
                    continue

                if state == 0:  # header line
                    if not line.startswith('>'):
                        return False
                elif state == 1:  # sequence line
                    if not self.sequence_regexp.match(line.upper()):
                        return False
                    sequence_size = len(line)
                else:  # dot-bracket structure line
                    # The structure must annotate every base of the sequence
                    if sequence_size != len(line) or not self.structure_regexp.match(line):
                        return False
                    records += 1

                state = (state + 1) % 3

        # An empty file or a truncated record is not valid Dot-Bracket
        return records > 0 and state == 0


class ConnectivityTable(Tabular):
    """Connectivity Table (CT): a header line followed by one row per base."""
    edam_format = "format_3309"
    file_ext = "ct"

    # Header: a leading number followed by at least one more field
    header_regexp = re.compile("^[0-9]+" + "(?:\t|[ ]+)" + "[^ \t]+")
    # Base row: index, base letter(s), then at least four more fields
    structure_regexp = re.compile("^[0-9]+" + "(?:\t|[ ]+)" + "[ACGTURYKMSWBDHVN]+" + "(?:\t|[ ]+)" + "[^\t]+" + "(?:\t|[ ]+)" + "[^\t]+" + "(?:\t|[ ]+)" + "[^\t]+" + "(?:\t|[ ]+)" + "[^\t]+")

    def __init__(self, **kwd):
        """Initialise the tabular datatype with the six fixed CT columns."""
        Tabular.__init__(self, **kwd)

        self.columns = 6
        self.column_names = ['base_index', 'base', 'neighbor_left', 'neighbor_right', 'partner', 'natural_numbering']
        self.column_types = ['int', 'str', 'int', 'int', 'int', 'int']

    def set_meta(self, dataset, **kwd):
        """Store the total number of lines of the dataset file as `data_lines`."""
        data_lines = 0

        # open() inside a with-block works on Python 2 and 3 and always
        # closes the handle, unlike the Python 2-only file() builtin.
        with open(dataset.file_name) as handle:
            for _ in handle:
                data_lines += 1

        dataset.metadata.data_lines = data_lines

    def sniff(self, filename):
        """
        Return True if `filename` looks like a Connectivity Table.

        `filename` may be a path string or an object exposing the path as
        a `file_name` attribute. The first non-blank line must match
        `header_regexp`; every following non-blank line must match
        `structure_regexp` (case-insensitively). A file without a header
        line is rejected.
        """
        # Accept both a plain path and a dataset-like object
        path = getattr(filename, 'file_name', filename)

        header_seen = False
        with open(path) as handle:
            for line in handle:
                line = line.strip()

                if not line:
                    # Blank lines (e.g. a trailing newline) carry no information
                    continue

                if not header_seen:
                    if not self.header_regexp.match(line):
                        return False
                    header_seen = True
                elif not self.structure_regexp.match(line.upper()):
                    return False

        # An empty file has no header and is therefore not a CT file
        return header_seen


class RNAML(GenericXml):
    """RNAML: an XML-based format handled as generic XML."""
    edam_format = "format_3311"
    file_ext = "rnaml"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_input.fa Tue Jun 16 11:21:12 2015 -0400 @@ -0,0 +1,2 @@ +>seq1 +GGGGGaaaCCCCC \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_output.ct Tue Jun 16 11:21:12 2015 -0400 @@ -0,0 +1,14 @@ +13 dG = -8.9 seq1 +1 G 0 2 12 1 0 2 +2 G 1 3 11 2 1 3 +3 G 2 4 10 3 2 4 +4 G 3 5 9 4 3 0 +5 G 4 6 0 5 0 0 +6 a 5 7 0 6 0 0 +7 a 6 8 0 7 0 0 +8 a 7 9 0 8 0 0 +9 C 8 10 4 9 0 10 +10 C 9 11 3 10 9 11 +11 C 10 12 2 11 10 12 +12 C 11 13 1 12 11 13 +13 C 12 0 0 13 12 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Jun 16 11:21:12 2015 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="unafold" version="3.8"> + <repository changeset_revision="9d157819abe5" name="package_unafold_3_8" owner="yhoogstrate" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unafold.xml Tue Jun 16 11:21:12 2015 -0400 @@ -0,0 +1,101 @@ +<?xml version="1.0"?> +<tool id="unafold" name="UNAFold" version="1.0.0"> + <description>UNAFold RNA and DNA structure prediction</description> + + <requirements> + <requirement type="package" version="3.8">unafold</requirement> + </requirements> + + <stdio> + <regex match="mv: cannot stat ..: No such file or directory" source="stderr" level="fatal" description="Could not find CT output file\n" /> + </stdio> + + <version_command>UNAFold.pl --version</version_command> + + <command> + UNAFold.pl + -n $n.a + -t $temp + + #if $n.a == "DNA" + -N $sodium + -M $magnesium + #end if + + $input_file && + output=\$(ls | grep \.ct | sort -r | head -n 1) && + mv "\$output" "$output_ct" + </command> + + <inputs> + <param format="fasta" name="input_file" type="data" label="Input sequence (FASTA)"/> + + <conditional name="n"> + <param name="a" type="select" label="Nucleic Acid Type"> + <option value="RNA" selected="true">RNA</option> + <option value="DNA">DNA</option> + </param> + + <when value="RNA"> + </when> + <when value="DNA"> + <param name="sodium" type="float" size="6" value="1.0" min="0" max="100" label="[Na+] in M"/> + <param name="magnesium" type="float" size="6" value="0.0" min="0" max="100" label="[Mg++] in M"/> + </when> + </conditional> + + <param name="temp" type="integer" size="3" value="37" min="0" max="100" label="Temperature (°C)"/> + </inputs> + + <outputs> + <data format="ct" name="output_ct" label="${tool.name} on ${input_file.hid}: ${input_file.name}"/> + </outputs> + + <tests> + <test> + <param name="input_file" value="test1_input.fa" ftype="fasta" /> + <param name="temp" value="37" /> + + <output name="output_ct" file="test1_output.ct" /> + </test> + </tests> + + <help> + Usage: UNAFold.pl [options] file [file] + + Options: + -V, --version + -h, --help + -n, --NA=(RNA | DNA) (defaults to RNA) + -t, --temp=<temperature> (defaults to 37) + -N, 
--sodium=<[Na+] in M> (defaults to 1) + -M, --magnesium=<[Mg++] in M> (defaults to 0) + -p, --polymer + -C, --Ct=<total strand concentration> + -I, --noisolate + -m, --maxbp=<maximum basepair distance> + -c, --constraints=<name of constraints file> (defaults to prefix.aux) + -P, --percent=<energy increment percent> (defaults to 5) + -W, --window=<window size> (default set by sequence length) + -X, --max=<maximum number of foldings> (defaults to 100) + --ann=(none | p-num | ss-count) (defaults to none) + --mode=(auto | bases | lines) (defaults to auto) + --label=<base numbering frequency> + --rotate=<structure rotation angle> + --run-type=(text | html) (defaults to text) + --model=(EM | PF) (defaults to EM) + --circular + Obscure options: + --allpairs + --maxloop=<maximum bulge/interior loop size> (defaults to 30) + --nodangle + --simple + --prefilter=<filter value> + + Report bugs to markhn@rpi.edu + </help> + + <citations> + <citation type="doi">10.1007/978-1-60327-429-6_1</citation> + </citations> +</tool>