Mercurial > repos > rnateam > rnacode
diff processMAF.sh @ 2:434332033e82 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rnacode commit b6d3800e260cdfcbe99e5f056344c3df101dad00
author | rnateam |
---|---|
date | Fri, 13 Apr 2018 07:40:08 -0400 |
parents | |
children | d49b9759e294 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/processMAF.sh Fri Apr 13 07:40:08 2018 -0400 @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +# RNAcode sometimes fails because of bugs. Since the manual suggests +# to call RNAcode on splitted alignments it is feasible to run +# RNAcode separately on the parts. This is implemented here. Command +# line parameters just passed on to RNAcode. +# +# - the script ensures that the region ids are continuous (otherwise +# the results for each block would start with 0) +# - also eps file names are corrected accordingly +# if RNAcode fails for one part it just outputs the part (for bug reporting) +# and continues + +# for splitting the alignment you can use breakMAF.pl from the RNAcode +# github (it seems to be absent from the 0.3 release) and feed the output +# with the desired RNAcode parameters into this shell script, e.g.: +# +# breakMAF.pl < chrM.maf | ./processMAF.sh --tabular --eps --eps-dir eps2/ + +# parse the command line options +# - removes --outfile, --help, --version, and the input file from the arguments +# - outfile and infile are stored in variables +declare -a args=() +while [[ $# -gt 0 ]] +do +key="$1" + case $key in + -d|--eps-dir) + epsdir=$2 + args+=($1 $2) + shift # past argument + shift # past value + ;; + -e|--eps) + eps=1 + args+=($1) + shift # past argument + ;; + -g|--gtf) + gtf=1 + args+=($1) + shift # past argument + ;; + -t|--tabular) + tabular=1 + args+=($1) + shift # past argument + ;; + -o|--outfile) + outfile=$2 + shift # past argument + shift # past value + ;; + -b|--best-only|-r|--best-region|-s|--stop-early) + args+=($1) + shift # past argument + ;; + -n|--num-samples|-p|--cutoff|-c|--pars|-i|--eps-cutoff) + args+=($1 $2) + shift # past argument + shift # past value + ;; + -h|--help|-v|--version) + shift # past argument + ;; + *) # unknown option + file=$1 + shift # past argument + ;; + esac +done + +# fix output (renumber blocks) +# and move eps files (if present) to tmpdir +function fix_output { + if [[ -z "$last" ]]; then + last=0 + fi + while read line + do + if [[ -z "$gtf" ]]; then + i=`echo "$line" | sed 's/^\([[:digit:]]\+\)[[:space:]].*/\1/'` + else + i=`echo $line | sed 's/.*Gene\([[:digit:]]\+\).*/\1/'` + fi + j=`echo "$i+$last" | bc` + if [[ -z "$gtf" ]]; then + echo "$line" | sed "s/^\([[:digit:]]\+\)\([[:space:]].*\)/$j\2/" + else + echo "$line" | sed "s/^\(.*\)Gene[0-9]\+\(\".*\)$/\1Gene$j\2/" + fi + #echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}' + if [[ ! -z "$eps" && -f ${epsdir:-eps}/hss-$i.eps ]]; then + mv ${epsdir:-eps}/hss-$i.eps $tmpd/hss-$j.eps + fi + done + if [[ ! -z "$j" ]]; then + last=`echo "$j+1" | bc` + unset j + fi +} + +# run RNAcode for $tempfile if >= 3 sequences +function run_rnacode { + >&2 echo -e "processing " `cat ${tmpif} | grep ^s | head -n 1 | cut -d" " -f1-6` +# >&2 echo "with RNAcode" $@ + nl=`cat ${tmpif} | grep "^s" | wc -l` + if [[ "$nl" -ge "3" ]]; then + # - filter the outfile for lines containing the ref and redirect everything to stderr + # https://github.com/wash/rnacode/issues/9 + # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable + if [[ ! -z "$gtf" ]]; then + field=1 + elif [[ ! -z "$tabular" ]]; then + field=7 + else + field=6 + fi + RNAcode $@ | awk -v ref=$ref -v field=$field '{if($(field)==ref){print $0}else{$0 > "/dev/stderr"}}' > ${tmpof} + + if [[ "$?" != "0" ]]; then + ef=$(mktemp -u -p '.') + cat ${tmpif} > ${ef}.maf + >&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)" + fi + fix_output < $tmpof + echo -n > ${tmpof} + else + >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\"" + fi +} + +ref="" +last=0 + +if [[ ! -z "$tabular" ]]; then + echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" >> ${outfile:-/dev/stdout} +fi + +tmpif=$(mktemp -p '.') +tmpof=$(mktemp -p '.') +tmpd=$(mktemp -d -p '.') + +# process lines of the alignment +# - save lines to tmpif +# - empty lines: process tmpif (ie. last alignment block) with RNAcode, clear tmpif +# - on the go the name of the reference species is determined from the 1st line +# of the alignment this is used then for filtering the RNAcode output +# in case of gtf output only the chromosome is printed, ie only chr1 instead of dm6.chr1 +while read line +do + if [[ "$line" =~ ^# ]]; then + echo -n > ${tmpif} + elif [[ "$line" =~ ^$ ]]; then + run_rnacode ${args[@]} ${tmpif} + # >> ${outfile:-/dev/stdout} + echo -n > ${tmpif} + else + if [[ -z $ref && "$line" =~ ^s ]]; then + if [[ -z "$gtf" ]]; then + ref=`echo $line | cut -d" " -f 2` + else + ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 3` + fi + fi + echo $line >> ${tmpif} + fi +done < ${file:-/dev/stdin} +# if there is something left -> process it +if [[ "`cat ${tmpif} | wc -l`" -gt "0" ]]; then + run_rnacode ${args[@]} ${tmpif} + # >> ${outfile:-/dev/stdout} +fi + +if [[ ! -z "$eps" ]]; then + mv ${tmpd}/*eps ${epsdir:-eps}/ +fi + +rm ${tmpif} +rm ${tmpof} +rmdir ${tmpd}