comparison processMAF.sh @ 2:434332033e82 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rnacode commit b6d3800e260cdfcbe99e5f056344c3df101dad00
author rnateam
date Fri, 13 Apr 2018 07:40:08 -0400
parents
children d49b9759e294
comparison
equal deleted inserted replaced
1:7a84c6c1c4e0 2:434332033e82
1 #!/usr/bin/env bash
2 # RNAcode sometimes fails because of bugs. Since the manual suggests
3 # to call RNAcode on splitted alignments it is feasible to run
4 # RNAcode separately on the parts. This is implemented here. Command
5 # line parameters just passed on to RNAcode.
6 #
7 # - the script ensures that the region ids are continuous (otherwise
8 # the results for each block would start with 0)
9 # - also eps file names are corrected accordingly
10 # if RNAcode fails for one part it just outputs the part (for bug reporting)
11 # and continues
12
13 # for splitting the alignment you can use breakMAF.pl from the RNAcode
14 # github (it seems to be absent from the 0.3 release) and feed the output
15 # with the desired RNAcode parameters into this shell script, e.g.:
16 #
17 # breakMAF.pl < chrM.maf | ./processMAF.sh --tabular --eps --eps-dir eps2/
18
19 # parse the command line options
20 # - removes --outfile, --help, --version, and the input file from the arguments
21 # - outfile and infile are stored in variables
22 declare -a args=()
23 while [[ $# -gt 0 ]]
24 do
25 key="$1"
26 case $key in
27 -d|--eps-dir)
28 epsdir=$2
29 args+=($1 $2)
30 shift # past argument
31 shift # past value
32 ;;
33 -e|--eps)
34 eps=1
35 args+=($1)
36 shift # past argument
37 ;;
38 -g|--gtf)
39 gtf=1
40 args+=($1)
41 shift # past argument
42 ;;
43 -t|--tabular)
44 tabular=1
45 args+=($1)
46 shift # past argument
47 ;;
48 -o|--outfile)
49 outfile=$2
50 shift # past argument
51 shift # past value
52 ;;
53 -b|--best-only|-r|--best-region|-s|--stop-early)
54 args+=($1)
55 shift # past argument
56 ;;
57 -n|--num-samples|-p|--cutoff|-c|--pars|-i|--eps-cutoff)
58 args+=($1 $2)
59 shift # past argument
60 shift # past value
61 ;;
62 -h|--help|-v|--version)
63 shift # past argument
64 ;;
65 *) # unknown option
66 file=$1
67 shift # past argument
68 ;;
69 esac
70 done
71
72 # fix output (renumber blocks)
73 # and move eps files (if present) to tmpdir
74 function fix_output {
75 if [[ -z "$last" ]]; then
76 last=0
77 fi
78 while read line
79 do
80 if [[ -z "$gtf" ]]; then
81 i=`echo "$line" | sed 's/^\([[:digit:]]\+\)[[:space:]].*/\1/'`
82 else
83 i=`echo $line | sed 's/.*Gene\([[:digit:]]\+\).*/\1/'`
84 fi
85 j=`echo "$i+$last" | bc`
86 if [[ -z "$gtf" ]]; then
87 echo "$line" | sed "s/^\([[:digit:]]\+\)\([[:space:]].*\)/$j\2/"
88 else
89 echo "$line" | sed "s/^\(.*\)Gene[0-9]\+\(\".*\)$/\1Gene$j\2/"
90 fi
91 #echo $line | awk -v n=$j '{printf("%d\t", n); for(i=2; i<=NF; i++){printf("%s", $(i)); if(i==NF){printf("\n")}else{printf("\t")}}}'
92 if [[ ! -z "$eps" && -f ${epsdir:-eps}/hss-$i.eps ]]; then
93 mv ${epsdir:-eps}/hss-$i.eps $tmpd/hss-$j.eps
94 fi
95 done
96 if [[ ! -z "$j" ]]; then
97 last=`echo "$j+1" | bc`
98 unset j
99 fi
100 }
101
102 # run RNAcode for $tempfile if >= 3 sequences
103 function run_rnacode {
104 >&2 echo -e "processing " `cat ${tmpif} | grep ^s | head -n 1 | cut -d" " -f1-6`
105 # >&2 echo "with RNAcode" $@
106 nl=`cat ${tmpif} | grep "^s" | wc -l`
107 if [[ "$nl" -ge "3" ]]; then
108 # - filter the outfile for lines containing the ref and redirect everything to stderr
109 # https://github.com/wash/rnacode/issues/9
110 # - we can not pipe stdout | ... | fix_output since then $last can not be used as global variable
111 if [[ ! -z "$gtf" ]]; then
112 field=1
113 elif [[ ! -z "$tabular" ]]; then
114 field=7
115 else
116 field=6
117 fi
118 RNAcode $@ | awk -v ref=$ref -v field=$field '{if($(field)==ref){print $0}else{$0 > "/dev/stderr"}}' > ${tmpof}
119
120 if [[ "$?" != "0" ]]; then
121 ef=$(mktemp -u -p '.')
122 cat ${tmpif} > ${ef}.maf
123 >&2 echo "RNAcode failed for the alignmentblock \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\" (${ef}.maf)"
124 fi
125 fix_output < $tmpof
126 echo -n > ${tmpof}
127 else
128 >&2 echo "less than 3 sequences in the alignment block \""`cat ${tmpif} | grep $ref | cut -d" " -f 1-6`"\""
129 fi
130 }
131
132 ref=""
133 last=0
134
135 if [[ ! -z "$tabular" ]]; then
136 echo -e "HSS #\tFrame\tLength\tFrom\tTo\tName\tStart\tEnd\tScore\tP" >> ${outfile:-/dev/stdout}
137 fi
138
139 tmpif=$(mktemp -p '.')
140 tmpof=$(mktemp -p '.')
141 tmpd=$(mktemp -d -p '.')
142
143 # process lines of the alignment
144 # - save lines to tmpif
145 # - empty lines: process tmpif (ie. last alignment block) with RNAcode, clear tmpif
146 # - on the go the name of the reference species is determined from the 1st line
147 # of the alignment this is used then for filtering the RNAcode output
148 # in case of gtf output only the chromosome is printed, ie only chr1 instead of dm6.chr1
149 while read line
150 do
151 if [[ "$line" =~ ^# ]]; then
152 echo -n > ${tmpif}
153 elif [[ "$line" =~ ^$ ]]; then
154 run_rnacode ${args[@]} ${tmpif}
155 # >> ${outfile:-/dev/stdout}
156 echo -n > ${tmpif}
157 else
158 if [[ -z $ref && "$line" =~ ^s ]]; then
159 if [[ -z "$gtf" ]]; then
160 ref=`echo $line | cut -d" " -f 2`
161 else
162 ref=`echo $line | sed 's/\./ /g' | cut -d" " -f 3`
163 fi
164 fi
165 echo $line >> ${tmpif}
166 fi
167 done < ${file:-/dev/stdin}
168 # if there is something left -> process it
169 if [[ "`cat ${tmpif} | wc -l`" -gt "0" ]]; then
170 run_rnacode ${args[@]} ${tmpif}
171 # >> ${outfile:-/dev/stdout}
172 fi
173
174 if [[ ! -z "$eps" ]]; then
175 mv ${tmpd}/*eps ${epsdir:-eps}/
176 fi
177
178 rm ${tmpif}
179 rm ${tmpof}
180 rmdir ${tmpd}