Mercurial > repos > rnateam > rnacode

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/breakMAF.pl	Fri Apr 13 07:40:08 2018 -0400
@@ -0,0 +1,312 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Getopt::Long;
+use POSIX qw(ceil floor);
+
+my $maxLength     = 400;
+my $desiredLength = 200;
+my $help          = 0;
+
+GetOptions(
+  "maxLength:i"     => \$maxLength,
+  "desiredLength:i" => \$desiredLength,
+  "help"            => \$help,
+  "h"               => \$help
+);
+
+if ($help) {
+  print STDERR "\nbreakMAF [options] < input.maf > output.maf\n\n";
+  print STDERR "Options:\n";
+  print STDERR "  maxLength:     Break all blocks longer than that (default: 400 columns)\n";
+  print STDERR "  desiredLength: Try to create blocks of this size (default: 200 columns)\n";
+  exit(1);
+}
+
+$/ = 'a score=';
+
+while ( my $block = <> ) {
+
+  $block = 'a score=' . $block;
+
+  my $currAln = readMAF($block)->[0];
+
+  next if not defined($currAln);
+
+  my $length = length( $currAln->[0]->{seq} );
+
+  if ( $length > $maxLength ) {
+
+    my $breakN      = int( $length / $desiredLength );
+    my $chunkLength = ceil( $length / $breakN );
+
+    my $from = 0;
+
+    while (1) {
+      my $to = $from + $chunkLength;
+
+      if ( $to > $length ) {
+        $to = $length;
+      }
+
+      my $slice = sliceAlnByColumn( $currAln, $from, $to );
+
+      print formatAln( $slice, 'maf' ), "\n";
+
+      $from = $to;
+
+      last if $from == $length;
+    }
+
+  } else {
+    print formatAln( $currAln, 'maf' ), "\n";
+  }
+}
+
+######################################################################
+#
+# readMAF($string)
+#
+# Converts the MAF in string to internal alignment format. Returns
+# list of usual array references.
+#
+######################################################################
+
+sub readMAF {
+
+  my $string = shift;
+
+  return [] if $string eq '';
+
+  my @input = split( "\n", $string );
+
+  #open(FILE,"<$file") || die("Could not read $file ($!)");
+
+  my @outAlns = ();
+  my @aln     = ();
+
+  foreach my $i ( 0 .. $#input ) {
+
+    $_ = $input[$i];
+
+    next if (/^\s?\#/);
+    next if (/^\s?a/);
+
+    if (/^\s?s/) {
+      ( my $dummy, my $name, my $start, my $length, my $strand, my $fullLength, my $seq ) = split;
+
+      my $end = $start + $length;
+
+      ( my $org, my $chrom ) = split( /\./, $name );
+
+      push @aln, {
+        name   => $name,
+        org    => $org,
+        chrom  => $chrom,
+        start  => $start,
+        end    => $end,
+        seq    => $seq,
+        strand => $strand
+        };
+    }
+
+    if ( /^\s?$/ and @aln ) {
+      push @outAlns, [@aln];
+      @aln = ();
+    }
+    if ( ( not defined $input[ $i + 1 ] ) and (@aln) ) {
+      push @outAlns, [@aln];
+      @aln = ();
+    }
+  }
+  return \@outAlns;
+}
+
+sub formatAln {
+
+  my @aln = @{ $_[0] };
+
+  my $format = $_[1];
+
+  $format = lc($format);
+
+  my @alnSeqs  = ();
+  my @alnNames = ();
+
+  my $counter = 1;
+
+  foreach my $row (@aln) {
+
+    my $name = "seq$counter";
+    $counter++;
+    if ( $row->{name} ) {
+      $name = ( $row->{name} );
+    } elsif ( $row->{org} and $row->{chrom} ) {
+      $name = "$row->{org}.$row->{chrom}";
+    }
+
+    my $start  = $row->{start};
+    my $end    = $row->{end};
+    my $strand = $row->{strand};
+
+    my $pos = '';
+
+    if (  defined $start
+      and defined $end
+      and defined $strand
+      and ( $format ne 'phylip' ) ) {
+      ( $start, $end ) = ( $end, $start ) if ( $strand eq '-' );
+      $pos = "/$start-$end";
+    }
+
+    push @alnNames, "$name$pos";
+    push @alnSeqs,  $row->{seq};
+
+  }
+
+  my $output = '';
+
+  if ( $format eq 'clustal' ) {
+
+    $output = "CLUSTAL W(1.81) multiple sequence alignment\n\n\n";
+    my $maxName = 0;
+
+    foreach my $name (@alnNames) {
+      $maxName = ( $maxName < length($name) ) ? length($name) : $maxName;
+    }
+
+    for my $i ( 0 .. $#alnNames ) {
+      my $buffer = " " x ( ( $maxName + 6 ) - length( $alnNames[$i] ) );
+      $alnNames[$i] .= $buffer;
+    }
+    my $columnWidth = 60;
+    my $currPos     = 0;
+    my $length      = length( $alnSeqs[0] );
+
+    while ( $currPos < $length ) {
+      for my $i ( 0 .. $#alnNames ) {
+        $output .= $alnNames[$i];
+        $output .= substr( $alnSeqs[$i], $currPos, $columnWidth );
+        $output .= "\n";
+      }
+      $output .= "\n\n";
+      $currPos += $columnWidth;
+    }
+  } elsif ( $format eq 'fasta' ) {
+    foreach my $i ( 0 .. $#alnNames ) {
+      my $name = $alnNames[$i];
+      my $seq  = $alnSeqs[$i];
+      $seq =~ s/(.{60})/$1\n/g;
+      $output .= ">$name\n$seq\n";
+    }
+  } elsif ( $format eq 'phylip' ) {
+    my $length = length( $alnSeqs[0] );
+    my $N      = @alnSeqs;
+    $output .= "  $N $length\n";
+    foreach my $i ( 0 .. $#alnNames ) {
+      my $name = $alnNames[$i];
+      my $seq  = $alnSeqs[$i];
+      $seq =~ s/(.{60})/$1\n/g;
+      $output .= "$name\n$seq\n";
+    }
+  } elsif ( $format eq 'maf' ) {
+    $output .= "a score=0\n";
+    foreach my $row (@aln) {
+      my $length = $row->{end} - $row->{start};
+      $output .= "s $row->{org}.$row->{chrom} $row->{start} $length $row->{strand} 0 $row->{seq}\n";
+    }
+  }
+  return $output;
+
+}
+
+######################################################################
+#
+# sliceAlnByColumn(\@aln ref-to-alignment, $start int, $end int)
+#
+# Returns slice of alignment specified by alignment column.
+#
+# \@aln ... alignment in list of hash format
+# $start, $end ... slice to cut
+#
+# Returns reference to alignment in list of hash format. This is a new
+# alignment, i.e. the input is not sliced in place
+#
+######################################################################
+
+sub sliceAlnByColumn {
+
+  my @aln = @{ $_[0] };
+  shift;
+  ( my $start, my $end ) = @_;
+
+  # correct ends without warning if outside of valid range
+  $start = 0 if ( $start < 0 );
+  $end = length( $aln[0]->{seq} ) if ( $end > length( $aln[0]->{seq} ) );
+
+  #my @newAln=@aln;
+
+  # make deep copy of list of hash
+  my @newAln = ();
+  foreach (@aln) {
+    push @newAln, { %{$_} };
+  }
+
+  foreach my $i ( 0 .. $#newAln ) {
+
+    my $oldStart = $newAln[$i]->{start};
+    my $oldEnd   = $newAln[$i]->{end};
+
+    $newAln[$i]->{start} = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $start );
+    $newAln[$i]->{end}   = alnCol2genomePos( $newAln[$i]->{seq}, $oldStart, $end - 1 ) + 1;
+    $newAln[$i]->{seq} = substr( $newAln[$i]->{seq}, $start, $end - $start );
+
+  }
+
+  return ( [@newAln] );
+
+}
+
+######################################################################
+#
+# alnCol2genomePos($seq string, $start int, $col int)
+#
+# Calculates the genomic position corresponding to a column in an
+# alignment.
+#
+# $seq ... sequence from alignment (i.e. letters with gaps)
+# $start ... Genomic position of first letter in $seq
+# $col ... column in the alignment that is to be mapped
+#
+# Returns genomic position. No error handling, so $col must be a valid
+# column of the string $seq.
+#
+#######################################################################
+
+sub alnCol2genomePos {
+
+  ( my $seq, my $start, my $col ) = @_;
+
+  $seq =~ s/\./-/g;    #Convert all gaps to "-"
+
+  my $newPos = $start;
+
+  # if gap only...
+  return $start if ( $seq =~ /^-+$/ );
+
+  ( my $tmp ) = $seq =~ /(-*)[^-]/;
+
+  my $leadingGaps = length($tmp);
+
+  # if column is in gap before first letter,
+  # return position of the first letter
+  return $start if ( $col < $leadingGaps );
+
+  $newPos = $start - 1;
+
+  for my $i ( $leadingGaps .. $col ) {
+    $newPos++ if ( ( my $t = substr( $seq, $i, 1 ) ) ne '-' );
+  }
+  return $newPos;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/processMAF.sh	Fri Apr 13 07:40:08 2018 -0400
@@ -0,0 +1,180 @@
+#!/usr/bin/env bash
+# RNAcode sometimes fails because of bugs. Since the manual suggests
+# to call RNAcode on splitted alignments it is feasible to run
+# RNAcode separately on the parts. This is implemented here. Command
+# line parameters just passed on to RNAcode.
+#
+# - the script ensures that the region ids are continuous (otherwise
+# the results for each block would start with 0)
+# - also eps file names are corrected accordingly
+# if RNAcode fails for one part it just outputs the part (for bug reporting)
+# and continues
+
+# for splitting the alignment you can use breakMAF.pl from the RNAcode
+# github (it seems to be absent from the 0.3 release) and feed the output
+# with the desired RNAcode parameters into this shell script, e.g.:
+#
+# breakMAF.pl < chrM.maf | ./processMAF.sh --tabular --eps --eps-dir eps2/
+
+# parse the command line options
+# - removes --outfile, --help, --version, and the input file from the arguments
+# - outfile and infile are stored in variables
+declare -a args=()
+while [[ $# -gt 0 ]]
+do
+key="$1"
+	case $key in
+		-d|--eps-dir)
+		epsdir=$2
+		args+=($1 $2)
+		shift # past argument
+		shift # past value
+		;;
+		-e|--eps)
+		eps=1
+		args+=($1)
+		shift # past argument
+		;;
+		-g|--gtf)
+		gtf=1
+		args+=($1)
+		shift # past argument
+		;;
+		-t|--tabular)
+		tabular=1
+		args+=($1)
+		shift # past argument
+		;;
+		-o|--outfile)
+		outfile=$2
+		shift # past argument
+		shift # past value
+		;;
+		-b|--best-only|-r|--best-region|-s|--stop-early)
+		args+=($1)
+		shift # past argument
+		;;
+		-n|--num-samples|-p|--cutoff|-c|--pars|-i|--eps-cutoff)
+		args+=($1 $2)
+		shift # past argument
+		shift # past value
+		;;
+		-h|--help|-v|--version)
+		shift # past argument
+		;;
+		*)    # unknown option
+		file=$1
+		shift # past argument
+		;;
+	esac
+done
+
+# fix output (renumber blocks)
+# and move eps files (if present) to tmpdir
+function fix_output {
+ 	if [[ -z "$last" ]]; then
+ 		last=0
+ 	fi
+	while read line
+ 	do
+		if [[ -z "$gtf" ]]; then
+			i=`echo "$line" | sed 's/^\([[:digit:]]\+\)[[:space:]].*/\1/'`
+		else
+			i=`echo $line | sed 's/.*Gene\([[:digit:]]\+\).*/\1/'`
+		fi
+ 		j=`echo "$i+$last" | bc`
+		if [[ -z "$gtf" ]]; then
+			echo "$line" | sed "s/^\([[:digit:]]\+\)\([[:space:]].*\)/$j\2/"
+		else
+			echo "$line" | sed "s/^\(.*\)Gene[0-9]\+\(\".*\)$/\1Gene$j\2/"
+		fi
+		#echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}'
+		if [[ ! -z "$eps" && -f ${epsdir:-eps}/hss-$i.eps ]]; then
+			mv ${epsdir:-eps}/hss-$i.eps $tmpd/hss-$j.eps
+		fi
+	done
+ 	if [[ ! -z "$j" ]]; then
+ 		last=`echo "$j+1" | bc`
+ 		unset j
+ 	fi
+}
+
+# run RNAcode for $tempfile if >= 3 sequences
+function run_rnacode {
+	>&2 echo -e "processing " `cat ${tmpif} | grep ^s | head -n 1 | cut -d" " -f1-6`
+# 	>&2 echo "with RNAcode" $@
+ 	nl=`cat ${tmpif} | grep "^s" | wc -l`
+	if [[ "$nl" -ge "3" ]]; then
+		# - filter the outfile for lines containing the ref and redirect everything to stderr
+		#   https://github.com/wash/rnacode/issues/9
+		# - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable
+		if [[ ! -z "$gtf" ]]; then
+			field=1
+		elif [[ ! -z "$tabular" ]]; then
+			field=7
+		else
+			field=6
+		fi
+		RNAcode $@ | awk -v ref=$ref -v field=$field '{if($(field)==ref){print $0}else{$0 > "/dev/stderr"}}' > ${tmpof}
+
+		if [[ "$?" != "0" ]]; then
+			ef=$(mktemp -u -p '.')
+			cat ${tmpif} > ${ef}.maf
+			>&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)"
+		fi
+		fix_output < $tmpof
+		echo -n > ${tmpof}
+	else
+		>&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\""
+	fi
+}
+
+ref=""
+last=0
+
+if [[ ! -z "$tabular" ]]; then
+	echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" >> ${outfile:-/dev/stdout}
+fi
+
+tmpif=$(mktemp -p '.')
+tmpof=$(mktemp -p '.')
+tmpd=$(mktemp -d -p '.')
+
+# process lines of the alignment
+# - save lines to tmpif
+# - empty lines: process tmpif (ie. last alignment block) with RNAcode, clear tmpif
+# - on the go the name of the reference species is determined from the 1st line
+#   of the alignment this is used then for filtering the RNAcode output
+#   in case of gtf output only the chromosome is printed, ie only chr1 instead of dm6.chr1
+while read line
+do
+	if [[ "$line" =~ ^# ]]; then
+		echo -n > ${tmpif}
+	elif [[ "$line" =~ ^$ ]]; then
+		run_rnacode ${args[@]} ${tmpif}
+		# >> ${outfile:-/dev/stdout}
+		echo -n > ${tmpif}
+	else
+                if [[ -z $ref && "$line" =~ ^s ]]; then
+			if [[ -z "$gtf" ]]; then
+				ref=`echo $line | cut -d" " -f 2`
+			else
+				ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 3`
+			fi
+		fi
+		echo $line >> ${tmpif}
+	fi
+done < ${file:-/dev/stdin}
+# if there is something left -> process it
+if [[ "`cat ${tmpif} | wc -l`" -gt "0" ]]; then
+	run_rnacode ${args[@]} ${tmpif}
+       #	>> ${outfile:-/dev/stdout}
+fi
+
+if [[ ! -z "$eps" ]]; then
+	mv ${tmpd}/*eps ${epsdir:-eps}/
+fi
+
+rm ${tmpif}
+rm ${tmpof}
+rmdir ${tmpd}
--- a/rnacode.xml	Sun Nov 12 18:16:21 2017 -0500
+++ b/rnacode.xml	Fri Apr 13 07:40:08 2018 -0400
@@ -1,17 +1,26 @@
-<tool id="rbc_rnacode" name="RNAcode" version="0.3.0">
+<tool id="rbc_rnacode" name="RNAcode" version="0.3.1">
     <description>Analyze the protein coding potential in MSA.</description>
     <requirements>
         <requirement type="package" version="0.3">rnacode</requirement>
     </requirements>
-    <stdio>
-        <exit_code range="1:"  level="fatal" description="Error occurred. Please check Tool Standard Error"/>
-        <exit_code range=":-1" level="fatal" description="Error occurred. Please check Tool Standard Error"/>
-    </stdio>
     <version_command>RNAcode --version</version_command>
-    <command>
+    <command detect_errors="exit_code">
     <![CDATA[
-        RNAcode
-
+        #if $cond_breakmaf.select_breakmaf == 'breakmaf'
+            '$__tool_directory__/breakMAF.pl'
+            --maxlength $cond_breakmaf.maxlength
+            --desiredLength $cond_breakmaf.desiredlength
+            < '$alignment' |
+        #else
+            cat '$alignment' |
+        #end if
+
+        #if $cond_breakmaf.select_breakmaf == 'breakmaf' and $cond_breakmaf.processseparately == 'yes'
+            '$__tool_directory__/processMAF.sh'
+        #else
+            RNAcode
+        #end if
+
         $outputFormat

         #if $cutoff and $cutoff is not None
@@ -28,6 +37,7 @@

         #if $cond_generateEPS.generateEPS == 'create'
             --eps
+            --eps-dir eps/
             #if $cond_generateEPS.eps_cutoff and $cond_generateEPS.eps_cutoff is not None
                 --eps-cutoff $cond_generateEPS.eps_cutoff
             #end if
@@ -37,8 +47,6 @@
             --pars "$pars"
         #end if

-        $alignment
-
         #if $outputFormat.value == '--tabular'
             --outfile $outFileDefault
         #elif $outputFormat.value == '--gtf'
@@ -49,6 +57,18 @@
     </command>
     <inputs>
         <param name="alignment" type="data" format="clustal,maf" label="Multiple Alignment" help="Alignment needs to be formatted in ClustalW or MAF format"/>
+        <conditional name="cond_breakmaf">
+            <param name="select_breakmaf" type="select" label="Break long alignment blocks" help="If your alignments contain blocks of long genomic regions it is usually not reasonable to score these long regions as a whole.">
+                <option value="keepmaf" selected="true">Process original alignment</option>
+                <option value="breakmaf">Break long alignment blocks</option>
+            </param>
+            <when value="breakmaf">
+                <param argument="--maxLength" name="maxlength" type="integer" optional="true" value="400" label="maxLength" help="Break all blocks longer than that."/>
+                <param argument="--desiredLength" name="desiredlength" type="integer" optional="true" value="200" label="desiredLength" help="Try to create blocks of this size."/>
+                <param name="processseparately" type="boolean" value="false" truevalue="yes" falsevalue="no" label="process blocks separately" help="If enables RNAcode is called separately for each block. This might be a reasonable strategy if RNAcode fails for some blocks."/>
+            </when>
+            <when value="keepmaf"/>
+        </conditional>
         <param argument="--cutoff" name="cutoff" type="float" optional="true" value="1.0" label="Cutoff" help="Show only regions that have a p-value below the given number. By default all hits are shown."/>
         <param argument="--num_samples" name="num_samples" type="integer" optional="true" value="100" label="Number of samples" help="Number of random alignments that are sampled to calculate the p-value. RNAcode estimates the significance of a coding prediction by sampling a given number of random alignments. Default is 100 which gives reasonably stable p-values that are useful for assessing the relevance of a prediction."/>
         <param argument="--stop_early" name="stop_early" type="boolean" truevalue="--stop-early" falsevalue="" checked="false" label="Stop early" help="Setting this option stops the sampling process as soon as it is clear that the best hit will not fall below the given p-value cutoff. For example, assume a p-value cutoff of 0.05 (see --cutoff) and a sample size of 1000 is given (see --num-samples). As soon as 50 random samples score better than the original alignment, the process is stopped and all hits in the original alignment are reported as p>0.05 (or by convention as 1.0 in gtf and tabular output)."/>
@@ -89,7 +109,7 @@
         </data>
         <collection name="output_eps" type="list" label="Plots for ${tool.name} on ${on_string}">
             <filter>cond_generateEPS['generateEPS'] == "create"</filter>
-            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.eps" directory="eps" ext="eps" visible="false"/>
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.eps" directory="eps" ext="eps"/>
         </collection>
     </outputs>
     <tests>
@@ -101,7 +121,28 @@
             <!-- sim_size is needed due to rnacode using random sampling: result files differ, better tests should be implemented -->
         </test>
         <test>
+            <param name="alignment" value="coding.aln"/>
+            <param name="generateEPS" value="nocreate"/>
+            <param name="outputFormat" value="--tabular"/>
+            <output name="outFileDefault" ftype="tabular" file="rnacode_result1.tabular" compare="sim_size"/>
+            <!-- sim_size is needed due to rnacode using random sampling: result files differ, better tests should be implemented -->
+        </test>
+        <test>
             <param name="alignment" value="coding.maf"/>
+            <conditional name="cond_breakmaf">
+                <param name="select_breakmaf" value="breakmaf"/>
+            </conditional>
+            <param name="generateEPS" value="nocreate"/>
+            <param name="outputFormat" value="--gtf"/>
+            <output name="outFileGTF" ftype="gtf" file="rnacode_result2.gtf" compare="sim_size"/>
+            <!-- sim_size is needed due to rnacode using random sampling: result files differ, better tests should be implemented -->
+        </test>
+        <test>
+            <param name="alignment" value="coding.maf"/>
+            <conditional name="cond_breakmaf">
+                <param name="select_breakmaf" value="breakmaf"/>
+                <param name="processseparately" value="yes"/>
+            </conditional>
             <param name="generateEPS" value="nocreate"/>
             <param name="outputFormat" value="--gtf"/>
             <output name="outFileGTF" ftype="gtf" file="rnacode_result2.gtf" compare="sim_size"/>