Mercurial > repos > rnateam > rnacode
changeset 2:434332033e82 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rnacode commit b6d3800e260cdfcbe99e5f056344c3df101dad00
author | rnateam |
---|---|
date | Fri, 13 Apr 2018 07:40:08 -0400 |
parents | 7a84c6c1c4e0 |
children | d49b9759e294 |
files | breakMAF.pl processMAF.sh rnacode.xml |
diffstat | 3 files changed, 544 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/breakMAF.pl Fri Apr 13 07:40:08 2018 -0400 @@ -0,0 +1,312 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; +use POSIX qw(ceil floor); + +my $maxLength = 400; +my $desiredLength = 200; +my $help = 0; + +GetOptions( + "maxLength:i" => \$maxLength, + "desiredLength:i" => \$desiredLength, + "help" => \$help, + "h" => \$help +); + +if ($help) { + print STDERR "\nbreakMAF [options] < input.maf > output.maf\n\n"; + print STDERR "Options:\n"; + print STDERR " maxLength: Break all blocks longer than that (default: 400 columns)\n"; + print STDERR " desiredLength: Try to create blocks of this size (default: 200 columns)\n"; + exit(1); +} + +$/ = 'a score='; + +while ( my $block = <> ) { + + $block = 'a score=' . $block; + + my $currAln = readMAF($block)->[0]; + + next if not defined($currAln); + + my $length = length( $currAln->[0]->{seq} ); + + if ( $length > $maxLength ) { + + my $breakN = int( $length / $desiredLength ); + my $chunkLength = ceil( $length / $breakN ); + + my $from = 0; + + while (1) { + my $to = $from + $chunkLength; + + if ( $to > $length ) { + $to = $length; + } + + my $slice = sliceAlnByColumn( $currAln, $from, $to ); + + print formatAln( $slice, 'maf' ), "\n"; + + $from = $to; + + last if $from == $length; + } + + } else { + print formatAln( $currAln, 'maf' ), "\n"; + } +} + +###################################################################### +# +# readMAF($string) +# +# Converts the MAF in string to internal alignment format. Returns +# list of usual array references. +# +###################################################################### + +sub readMAF { + + my $string = shift; + + return [] if $string eq ''; + + my @input = split( "\n", $string ); + + #open(FILE,"<$file") || die("Could not read $file ($!)"); + + my @outAlns = (); + my @aln = (); + + foreach my $i ( 0 .. $#input ) { + + $_ = $input[$i]; + + next if (/^\s?\#/); + next if (/^\s?a/); + + if (/^\s?s/) { + ( my $dummy, my $name, my $start, my $length, my $strand, my $fullLength, my $seq ) = split; + + my $end = $start + $length; + + ( my $org, my $chrom ) = split( /\./, $name ); + + push @aln, { + name => $name, + org => $org, + chrom => $chrom, + start => $start, + end => $end, + seq => $seq, + strand => $strand + }; + } + + if ( /^\s?$/ and @aln ) { + push @outAlns, [@aln]; + @aln = (); + } + if ( ( not defined $input[ $i + 1 ] ) and (@aln) ) { + push @outAlns, [@aln]; + @aln = (); + } + } + return \@outAlns; +} + +sub formatAln { + + my @aln = @{ $_[0] }; + + my $format = $_[1]; + + $format = lc($format); + + my @alnSeqs = (); + my @alnNames = (); + + my $counter = 1; + + foreach my $row (@aln) { + + my $name = "seq$counter"; + $counter++; + if ( $row->{name} ) { + $name = ( $row->{name} ); + } elsif ( $row->{org} and $row->{chrom} ) { + $name = "$row->{org}.$row->{chrom}"; + } + + my $start = $row->{start}; + my $end = $row->{end}; + my $strand = $row->{strand}; + + my $pos = ''; + + if ( defined $start + and defined $end + and defined $strand + and ( $format ne 'phylip' ) ) { + ( $start, $end ) = ( $end, $start ) if ( $strand eq '-' ); + $pos = "/$start-$end"; + } + + push @alnNames, "$name$pos"; + push @alnSeqs, $row->{seq}; + + } + + my $output = ''; + + if ( $format eq 'clustal' ) { + + $output = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n"; + my $maxName = 0; + + foreach my $name (@alnNames) { + $maxName = ( $maxName < length($name) ) ? length($name) : $maxName; + } + + for my $i ( 0 .. $#alnNames ) { + my $buffer = " " x ( ( $maxName + 6 ) - length( $alnNames[$i] ) ); + $alnNames[$i] .= $buffer; + } + my $columnWidth = 60; + my $currPos = 0; + my $length = length( $alnSeqs[0] ); + + while ( $currPos < $length ) { + for my $i ( 0 .. $#alnNames ) { + $output .= $alnNames[$i]; + $output .= substr( $alnSeqs[$i], $currPos, $columnWidth ); + $output .= "\n"; + } + $output .= "\n\n"; + $currPos += $columnWidth; + } + } elsif ( $format eq 'fasta' ) { + foreach my $i ( 0 .. $#alnNames ) { + my $name = $alnNames[$i]; + my $seq = $alnSeqs[$i]; + $seq =~ s/(.{60})/$1\n/g; + $output .= ">$name\n$seq\n"; + } + } elsif ( $format eq 'phylip' ) { + my $length = length( $alnSeqs[0] ); + my $N = @alnSeqs; + $output .= " $N $length\n"; + foreach my $i ( 0 .. $#alnNames ) { + my $name = $alnNames[$i]; + my $seq = $alnSeqs[$i]; + $seq =~ s/(.{60})/$1\n/g; + $output .= "$name\n$seq\n"; + } + } elsif ( $format eq 'maf' ) { + $output .= "a score=0\n"; + foreach my $row (@aln) { + my $length = $row->{end} - $row->{start}; + $output .= "s $row->{org}.$row->{chrom} $row->{start} $length $row->{strand} 0 $row->{seq}\n"; + } + } + return $output; + +} + +###################################################################### +# +# sliceAlnByColumn(\@aln ref-to-alignment, $start int, $end int) +# +# Returns slice of alignment specified by alignment column. +# +# \@aln ... alignment in list of hash format +# $start, $end ... slice to cut +# +# Returns reference to alignment in list of hash format. This is a new +# alignment, i.e. the input is not sliced in place +# +###################################################################### + +sub sliceAlnByColumn { + + my @aln = @{ $_[0] }; + shift; + ( my $start, my $end ) = @_; + + # correct ends without warning if outside of valid range + $start = 0 if ( $start < 0 ); + $end = length( $aln[0]->{seq} ) if ( $end > length( $aln[0]->{seq} ) ); + + #my @newAln=@aln; + + # make deep copy of list of hash + my @newAln = (); + foreach (@aln) { + push @newAln, { %{$_} }; + } + + foreach my $i ( 0 .. $#newAln ) { + + my $oldStart = $newAln[$i]->{start}; + my $oldEnd = $newAln[$i]->{end}; + + $newAln[$i]->{start} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $start ); + $newAln[$i]->{end} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $end - 1 ) + 1; + $newAln[$i]->{seq} = substr( $newAln[$i]->{seq}, $start, $end - $start ); + + } + + return ( [@newAln] ); + +} + +###################################################################### +# +# alnCol2genomePos($seq string, $start int, $col int) +# +# Calculates the genomic position corresponding to a column in an +# alignment. +# +# $seq ... sequence from alignment (i.e. letters with gaps) +# $start ... Genomic position of first letter in $seq +# $col ... column in the alignment that is to be mapped +# +# Returns genomic position. No error handling, so $col must be a valid +# column of the string $seq. +# +####################################################################### + +sub alnCol2genomePos { + + ( my $seq, my $start, my $col ) = @_; + + $seq =~ s/\./-/g; #Convert all gaps to "-" + + my $newPos = $start; + + # if gap only... + return $start if ( $seq =~ /^-+$/ ); + + ( my $tmp ) = $seq =~ /(-*)[^-]/; + + my $leadingGaps = length($tmp); + + # if column is in gap before first letter, + # return position of the first letter + return $start if ( $col < $leadingGaps ); + + $newPos = $start - 1; + + for my $i ( $leadingGaps .. $col ) { + $newPos++ if ( ( my $t = substr( $seq, $i, 1 ) ) ne '-' ); + } + return $newPos; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/processMAF.sh Fri Apr 13 07:40:08 2018 -0400 @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +# RNAcode sometimes fails because of bugs. Since the manual suggests +# to call RNAcode on splitted alignments it is feasible to run +# RNAcode separately on the parts. This is implemented here. Command +# line parameters just passed on to RNAcode. +# +# - the script ensures that the region ids are continuous (otherwise +# the results for each block would start with 0) +# - also eps file names are corrected accordingly +# if RNAcode fails for one part it just outputs the part (for bug reporting) +# and continues + +# for splitting the alignment you can use breakMAF.pl from the RNAcode +# github (it seems to be absent from the 0.3 release) and feed the output +# with the desired RNAcode parameters into this shell script, e.g.: +# +# breakMAF.pl < chrM.maf | ./processMAF.sh --tabular --eps --eps-dir eps2/ + +# parse the command line options +# - removes --outfile, --help, --version, and the input file from the arguments +# - outfile and infile are stored in variables +declare -a args=() +while [[ $# -gt 0 ]] +do +key="$1" + case $key in + -d|--eps-dir) + epsdir=$2 + args+=($1 $2) + shift # past argument + shift # past value + ;; + -e|--eps) + eps=1 + args+=($1) + shift # past argument + ;; + -g|--gtf) + gtf=1 + args+=($1) + shift # past argument + ;; + -t|--tabular) + tabular=1 + args+=($1) + shift # past argument + ;; + -o|--outfile) + outfile=$2 + shift # past argument + shift # past value + ;; + -b|--best-only|-r|--best-region|-s|--stop-early) + args+=($1) + shift # past argument + ;; + -n|--num-samples|-p|--cutoff|-c|--pars|-i|--eps-cutoff) + args+=($1 $2) + shift # past argument + shift # past value + ;; + -h|--help|-v|--version) + shift # past argument + ;; + *) # unknown option + file=$1 + shift # past argument + ;; + esac +done + +# fix output (renumber blocks) +# and move eps files (if present) to tmpdir +function fix_output { + if [[ -z "$last" ]]; then + last=0 + fi + while read line + do + if [[ -z "$gtf" ]]; then + i=`echo "$line" | sed 's/^\([[:digit:]]\+\)[[:space:]].*/\1/'` + else + i=`echo $line | sed 's/.*Gene\([[:digit:]]\+\).*/\1/'` + fi + j=`echo "$i+$last" | bc` + if [[ -z "$gtf" ]]; then + echo "$line" | sed "s/^\([[:digit:]]\+\)\([[:space:]].*\)/$j\2/" + else + echo "$line" | sed "s/^\(.*\)Gene[0-9]\+\(\".*\)$/\1Gene$j\2/" + fi + #echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}' + if [[ ! -z "$eps" && -f ${epsdir:-eps}/hss-$i.eps ]]; then + mv ${epsdir:-eps}/hss-$i.eps $tmpd/hss-$j.eps + fi + done + if [[ ! -z "$j" ]]; then + last=`echo "$j+1" | bc` + unset j + fi +} + +# run RNAcode for $tempfile if >= 3 sequences +function run_rnacode { + >&2 echo -e "processing " `cat ${tmpif} | grep ^s | head -n 1 | cut -d" " -f1-6` +# >&2 echo "with RNAcode" $@ + nl=`cat ${tmpif} | grep "^s" | wc -l` + if [[ "$nl" -ge "3" ]]; then + # - filter the outfile for lines containing the ref and redirect everything to stderr + # https://github.com/wash/rnacode/issues/9 + # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable + if [[ ! -z "$gtf" ]]; then + field=1 + elif [[ ! -z "$tabular" ]]; then + field=7 + else + field=6 + fi + RNAcode $@ | awk -v ref=$ref -v field=$field '{if($(field)==ref){print $0}else{$0 > "/dev/stderr"}}' > ${tmpof} + + if [[ "$?" != "0" ]]; then + ef=$(mktemp -u -p '.') + cat ${tmpif} > ${ef}.maf + >&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)" + fi + fix_output < $tmpof + echo -n > ${tmpof} + else + >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\"" + fi +} + +ref="" +last=0 + +if [[ ! -z "$tabular" ]]; then + echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" >> ${outfile:-/dev/stdout} +fi + +tmpif=$(mktemp -p '.') +tmpof=$(mktemp -p '.') +tmpd=$(mktemp -d -p '.') + +# process lines of the alignment +# - save lines to tmpif +# - empty lines: process tmpif (ie. last alignment block) with RNAcode, clear tmpif +# - on the go the name of the reference species is determined from the 1st line +# of the alignment this is used then for filtering the RNAcode output +# in case of gtf output only the chromosome is printed, ie only chr1 instead of dm6.chr1 +while read line +do + if [[ "$line" =~ ^# ]]; then + echo -n > ${tmpif} + elif [[ "$line" =~ ^$ ]]; then + run_rnacode ${args[@]} ${tmpif} + # >> ${outfile:-/dev/stdout} + echo -n > ${tmpif} + else + if [[ -z $ref && "$line" =~ ^s ]]; then + if [[ -z "$gtf" ]]; then + ref=`echo $line | cut -d" " -f 2` + else + ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 3` + fi + fi + echo $line >> ${tmpif} + fi +done < ${file:-/dev/stdin} +# if there is something left -> process it +if [[ "`cat ${tmpif} | wc -l`" -gt "0" ]]; then + run_rnacode ${args[@]} ${tmpif} + # >> ${outfile:-/dev/stdout} +fi + +if [[ ! -z "$eps" ]]; then + mv ${tmpd}/*eps ${epsdir:-eps}/ +fi + +rm ${tmpif} +rm ${tmpof} +rmdir ${tmpd}
--- a/rnacode.xml Sun Nov 12 18:16:21 2017 -0500 +++ b/rnacode.xml Fri Apr 13 07:40:08 2018 -0400 @@ -1,17 +1,26 @@ -<tool id="rbc_rnacode" name="RNAcode" version="0.3.0"> +<tool id="rbc_rnacode" name="RNAcode" version="0.3.1"> <description>Analyze the protein coding potential in MSA.</description> <requirements> <requirement type="package" version="0.3">rnacode</requirement> </requirements> - <stdio> - <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error"/> - <exit_code range=":-1" level="fatal" description="Error occurred. Please check Tool Standard Error"/> - </stdio> <version_command>RNAcode --version</version_command> - <command> + <command detect_errors="exit_code"> <![CDATA[ - RNAcode - + #if $cond_breakmaf.select_breakmaf == 'breakmaf' + '$__tool_directory__/breakMAF.pl' + --maxlength $cond_breakmaf.maxlength + --desiredLength $cond_breakmaf.desiredlength + < '$alignment' | + #else + cat '$alignment' | + #end if + + #if $cond_breakmaf.select_breakmaf == 'breakmaf' and $cond_breakmaf.processseparately == 'yes' + '$__tool_directory__/processMAF.sh' + #else + RNAcode + #end if + $outputFormat #if $cutoff and $cutoff is not None @@ -28,6 +37,7 @@ #if $cond_generateEPS.generateEPS == 'create' --eps + --eps-dir eps/ #if $cond_generateEPS.eps_cutoff and $cond_generateEPS.eps_cutoff is not None --eps-cutoff $cond_generateEPS.eps_cutoff #end if @@ -37,8 +47,6 @@ --pars "$pars" #end if - $alignment - #if $outputFormat.value == '--tabular' --outfile $outFileDefault #elif $outputFormat.value == '--gtf' @@ -49,6 +57,18 @@ </command> <inputs> <param name="alignment" type="data" format="clustal,maf" label="Multiple Alignment" help="Alignment needs to be formatted in ClustalW or MAF format"/> + <conditional name="cond_breakmaf"> + <param name="select_breakmaf" type="select" label="Break long alignment blocks" help="If your alignments contain blocks of long genomic regions it is usually not reasonable to score these long regions as a whole."> + <option value="keepmaf" selected="true">Process original alignment</option> + <option value="breakmaf">Break long alignment blocks</option> + </param> + <when value="breakmaf"> + <param argument="--maxLength" name="maxlength" type="integer" optional="true" value="400" label="maxLength" help="Break all blocks longer than that."/> + <param argument="--desiredLength" name="desiredlength" type="integer" optional="true" value="200" label="desiredLength" help="Try to create blocks of this size."/> + <param name="processseparately" type="boolean" value="false" truevalue="yes" falsevalue="no" label="process blocks separately" help="If enables RNAcode is called separately for each block. This might be a reasonable strategy if RNAcode fails for some blocks."/> + </when> + <when value="keepmaf"/> + </conditional> <param argument="--cutoff" name="cutoff" type="float" optional="true" value="1.0" label="Cutoff" help="Show only regions that have a p-value below the given number. By default all hits are shown."/> <param argument="--num_samples" name="num_samples" type="integer" optional="true" value="100" label="Number of samples" help="Number of random alignments that are sampled to calculate the p-value. RNAcode estimates the significance of a coding prediction by sampling a given number of random alignments. Default is 100 which gives reasonably stable p-values that are useful for assessing the relevance of a prediction."/> <param argument="--stop_early" name="stop_early" type="boolean" truevalue="--stop-early" falsevalue="" checked="false" label="Stop early" help="Setting this option stops the sampling process as soon as it is clear that the best hit will not fall below the given p-value cutoff. For example, assume a p-value cutoff of 0.05 (see --cutoff) and a sample size of 1000 is given (see --num-samples). As soon as 50 random samples score better than the original alignment, the process is stopped and all hits in the original alignment are reported as p>0.05 (or by convention as 1.0 in gtf and tabular output)."/> @@ -89,7 +109,7 @@ </data> <collection name="output_eps" type="list" label="Plots for ${tool.name} on ${on_string}"> <filter>cond_generateEPS['generateEPS'] == "create"</filter> - <discover_datasets pattern="(?P<designation>.*)\.eps" directory="eps" ext="eps" visible="false"/> + <discover_datasets pattern="(?P<designation>.*)\.eps" directory="eps" ext="eps"/> </collection> </outputs> <tests> @@ -101,7 +121,28 @@ <!-- sim_size is needed due to rnacode using random sampling: result files differ, better tests should be implemented --> </test> <test> + <param name="alignment" value="coding.aln"/> + <param name="generateEPS" value="nocreate"/> + <param name="outputFormat" value="--tabular"/> + <output name="outFileDefault" ftype="tabular" file="rnacode_result1.tabular" compare="sim_size"/> + <!-- sim_size is needed due to rnacode using random sampling: result files differ, better tests should be implemented --> + </test> + <test> <param name="alignment" value="coding.maf"/> + <conditional name="cond_breakmaf"> + <param name="select_breakmaf" value="breakmaf"/> + </conditional> + <param name="generateEPS" value="nocreate"/> + <param name="outputFormat" value="--gtf"/> + <output name="outFileGTF" ftype="gtf" file="rnacode_result2.gtf" compare="sim_size"/> + <!-- sim_size is needed due to rnacode using random sampling: result files differ, better tests should be implemented --> + </test> + <test> + <param name="alignment" value="coding.maf"/> + <conditional name="cond_breakmaf"> + <param name="select_breakmaf" value="breakmaf"/> + <param name="processseparately" value="yes"/> + </conditional> <param name="generateEPS" value="nocreate"/> <param name="outputFormat" value="--gtf"/> <output name="outFileGTF" ftype="gtf" file="rnacode_result2.gtf" compare="sim_size"/>