annotate COG/bac-genomics-scripts/prot_finder/prot_finder_pipe.sh @ 15:dbde253606c5 draft default tip

Uploaded
author dereeper
date Wed, 11 Dec 2024 08:25:06 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/bin/bash
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2 set -e
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 #############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 # Functions #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6 #############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
# Print the help text of this wrapper script.
# The text goes to STDERR; the synopsis lines show STDOUT being redirected
# to the BLASTP hit table, so help output must not end up there.
usage () {
  # Script file name without its leading directory part, for the synopsis.
  local script_name=${0##*/}

  cat >&2 <<EOF
Usage: ${script_name} [OPTION] -q query.faa -f (embl|gbk) > blast_hits.tsv
or: ${script_name} [OPTION] -q query.faa -s subject.faa -d result_dir \\
> result_dir/blast_hits.tsv

Bash wrapper script to run a pipeline consisting of optional
'cds_extractor.pl' (with its options '-p -f'), BLASTP, 'prot_finder.pl',
and optional Clustal Omega. 'cds_extractor.pl' (only for shell script
option '-f') and 'prot_finder.pl' either have to be installed in the
global PATH or present in the current working directory. BLASTP is run
with disabled query filtering, locally optimal Smith-Waterman alignments,
and increasing the number of database sequences to show alignments
to 500 for BioPerl parsing (legacy: '-F F -s T -b 500', plus: '-seg
no -use_sw_tback -num_alignments 500').

The script ends with the STDERR message 'Pipeline finished!', if this
is not the case have a look at the log files in the result directory
for errors.

Mandatory options:
-q <str> Path to query protein multi-FASTA file (*.faa)
with unique FASTA IDs
-f <str> File extension for files in the current working
directory to use for 'cds_extractor.pl' (e.g.
'embl' or 'gbk'); excludes shell script option '-s'
or
-s <str> Path to subject protein multi-FASTA file (*.faa)
already created with 'cds_extractor.pl' (and its
options '-p -f'), will not run 'cds_extractor.pl';
excludes shell script option '-f'

Optional options:
-h Print usage
-d <str> Path to result folder [default = results_i#_cq#]
-p (legacy|plus) BLASTP suite to use [default = plus]
-e <real> E-value for BLASTP [default = 1e-10]
-t <int> Number of threads to be used for BLASTP and
Clustal Omega [default = all processors on
system]
-i <int> Query identity cutoff for significant hits
[default = 70]
-c <int> Query coverage cutoff [default = 70]
-k <int> Subject coverage cutoff [default = 0]
-b Give only best hit (highest identity) for each
subject sequence
-a Multiple alignment of each multi-FASTA result
file with Clustal Omega
-o <str> Path to executable Clustal Omega binary if not
in global PATH; requires shell script option '-a'
-m Clean up all non-essential files

Author: Andreas Leimbach <aleimba[at]gmx[dot]de>
EOF
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Check external dependencies
# Abort through err() unless the executable named in $1 is found in PATH.
# Arguments:
#   $1 - name of the required executable
#   $2 - optional hint that is appended verbatim to the error message
check_commands () {
  # 'type -P' is a bash builtin that searches PATH only (shell functions,
  # aliases and builtins do not count), so the check no longer depends on the
  # external, non-standard 'which' program being installed.
  type -P "$1" > /dev/null || err "Required executable '$1' not found in global PATH, please install.$2"
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Check cutoff options input
# Validate a percentage cutoff; aborts through err() unless the value is an
# integer between 0 and 100 (inclusive).
# Arguments:
#   $1 - value given by the user
#   $2 - option letter the value belongs to (only used in the error message)
check_cutoff_options () {
  local message="Option '-$2' requires an integer number >= 0 and <= 100 as value, not '$1'!"
  # Digits only. Leading zeros are tolerated, but at most three significant
  # digits are captured, so arbitrarily long numbers are rejected here instead
  # of overflowing the integer arithmetic below.
  [[ $1 =~ ^0*([0-9]{1,3})$ ]] || err "$message"
  # '10#' forces base 10: without it bash reads a leading zero as octal, i.e.
  # '08' is an arithmetic error and '0144' (decimal 144) would pass as 100.
  (( 10#${BASH_REMATCH[1]} <= 100 )) || err "$message" # arithmetic expression (can only handle integer math, not float)
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Error messages
# Print a fatal error message to STDERR (preceded by an empty line) and
# terminate the script with exit status 1.
# Arguments:
#   $* - message text; several arguments are joined with spaces
err () {
  # A fixed printf format prints the message literally. 'echo -e' would expand
  # backslash sequences contained in user-supplied values (e.g. file paths)
  # that are embedded in the message.
  printf '\n### Fatal error: %s\n' "$*" 1>&2
  exit 1
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Run status of script to STDERR instead of STDOUT
# Print a status line, prefixed with '# ', to STDERR.
# Arguments:
#   $* - message text; several arguments are joined with spaces
msg () {
  # A fixed printf format prints the text literally; 'echo -e' would expand
  # backslash sequences in logged user input such as the script call command.
  printf '# %s\n' "$*" 1>&2
}
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
shopt -s extglob # enable extended globs for bash

Cmdline="$*" # original arguments, reported later in the run log

### Getopts
# Defaults; each one can be overridden by the corresponding option below
Blastp_Suite="plus"
Evalue="1e-10"
# Number of installed processors. Used as default AND as upper limit for '-t',
# so the default can never fail the limit check further down (which happened
# when 'nproc' reported fewer available processors than 'nproc --all').
Max_Threads="$(nproc --all)"
Threads="$Max_Threads"
Ident_Cut=70
Cov_Query_Cutoff=70
Cov_Subject_Cutoff=0
# Accepted E-value notations: integer ('10'), decimal ('0.001') or scientific
# exponential ('1e-10', '2.5E-5'). Kept in a variable so that the regex needs
# no shell escaping inside '[[ =~ ]]'.
Evalue_Regex='^[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'

while getopts ':q:f:s:d:p:e:t:i:c:k:bao:mh' opt; do # beginning ':' indicates silent mode, trailing ':' after each option requires value
  case $opt in
    q) Query_File=$OPTARG
       [[ -r $Query_File ]] || err "Cannot read query file '$Query_File'!"
       ;;
    f) Subject_Ext=$OPTARG
       # '-print -quit' stops at the first hit, only existence matters here
       [[ -n "$(find . -maxdepth 1 -name "*.${Subject_Ext}" -print -quit)" ]] || err "No files with the option '-f' specified file extension '$Subject_Ext' found in the current working directory!"
       ;;
    s) Subject_File=$OPTARG
       [[ -r $Subject_File ]] || err "Cannot read subject file '$Subject_File'!"
       ;;
    d) Result_Dir=$OPTARG;; # checked below
    p) Blastp_Suite=$OPTARG
       [[ $Blastp_Suite = @(plus|legacy) ]] || err "Option '-p' only allows 'plus' for BLASTP+ or 'legacy' for legacy BLASTP as value, not '$Blastp_Suite'!" # extended glob (regex more expensive)
       ;;
    e) Evalue=$OPTARG
       [[ $Evalue =~ $Evalue_Regex ]] || err "Option '-e' requires a real number (integer, decimal, or scientific exponential notation) as value, not '$Evalue'!"
       ;;
    t) Threads=$OPTARG
       [[ $Threads =~ ^[1-9][0-9]*$ ]] || err "Option '-t' requires an integer > 0 as value, not '$Threads'!"
       ;;
    i) Ident_Cut=$OPTARG
       check_cutoff_options "$Ident_Cut" "i"
       ;;
    c) Cov_Query_Cutoff=$OPTARG
       check_cutoff_options "$Cov_Query_Cutoff" "c"
       ;;
    k) Cov_Subject_Cutoff=$OPTARG
       check_cutoff_options "$Cov_Subject_Cutoff" "k"
       ;;
    b) Opt_Best_Hit=1;;
    a) Opt_Align=1;;
    o) Clustal_Path=$OPTARG
       [[ -x $Clustal_Path ]] || err "Option '-o' requires the path to an executable Clustal Omega binary as value, not '$Clustal_Path'!"
       ;;
    m) Opt_Clean_Up=1;;
    h) usage; exit;; # usage function, exit code zero
    \?) err "Invalid option '-$OPTARG'. See usage with '-h'!";;
    :) err "Option '-$OPTARG' requires a value. See usage with '-h'!";;
  esac
done


### Check options and enforce mandatory options
# '-q' is always required, together with exactly one of '-f' and '-s'
[[ $Query_File && ($Subject_Ext || $Subject_File) ]] || err "Mandatory options '-q' and '-f' or '-s' are missing!"

[[ $Subject_Ext && $Subject_File ]] && err "Options '-f' and '-s' given which exclude themselves. Choose either '-f' OR '-s'!"

# Same processor count as used for the default of 'Threads' above
(( Threads <= Max_Threads )) || err "Number of threads for option '-t', '$Threads', exceeds the maximum $Max_Threads processors on the system!"

# A Clustal Omega path only makes sense with alignments, so '-o' implies '-a'
[[ ! $Opt_Align && $Clustal_Path ]] && Opt_Align=1 && msg "Option '-o' requires option '-a', forcing option '-a'!"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Check external dependencies
# Verifies that every program needed for the selected options is available
# and decides how the two Perl scripts will be called later on.
echo >&2 # newline
msg "Checking pipeline dependencies"
if [[ $Opt_Align && ! $Clustal_Path ]]; then
  check_commands "clustalo" " Or use option '-o' to give the path to the binary!"
fi

for tool in cds_extractor.pl formatdb blastall makeblastdb blastp prot_finder.pl; do
  # Skip everything that is not needed with the given options
  case $tool in
    cds_extractor.pl)
      # a ready-made subject file ('-s') means no CDS extraction
      if [[ $Subject_File ]]; then continue; fi
      ;;
    formatdb|blastall)
      # legacy BLAST binaries are irrelevant for the 'plus' suite
      if [[ $Blastp_Suite == plus ]]; then continue; fi
      ;;
    makeblastdb|blastp)
      # BLAST+ binaries are irrelevant for the 'legacy' suite
      if [[ $Blastp_Suite == legacy ]]; then continue; fi
      ;;
  esac

  # Plain binaries simply have to be found in PATH
  if [[ $tool != *.pl ]]; then
    check_commands "$tool"
    continue
  fi

  # Perl scripts: a readable copy in the current working directory is run
  # through 'perl', otherwise the script itself has to be found in PATH
  if [[ -r "./$tool" ]]; then
    launcher="perl $tool"
  else
    launcher="$tool"
    check_commands "$tool" " Or copy the Perl script in the current working directory."
  fi
  case $tool in
    cds*)  Cds_Extractor_Cmd="$launcher" ;;
    prot*) Prot_Finder_Cmd="$launcher" ;;
  esac
done

msg "Script call command: ${0##*/} $Cmdline"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Create result folder
# The default name depends on the cutoffs and can thus only be built after
# option parsing; a user-given path loses one trailing '/'.
if [[ -n $Result_Dir ]]; then
  Result_Dir="${Result_Dir%/}"
else
  Result_Dir="results_i${Ident_Cut}_cq${Cov_Query_Cutoff}"
fi

if [[ ! -d $Result_Dir ]]; then
  mkdir -pv "$Result_Dir" 1>&2
else
  # An existing directory is only accepted if it holds at most one entry and
  # that entry is empty (this allows redirecting STDOUT of this script into
  # a file inside the result directory, which the shell creates beforehand).
  entries_seen=0
  for entry in "$Result_Dir"/*; do
    if (( entries_seen == 1 )) || [[ -s $entry ]]; then
      err "Result directory '$Result_Dir' already exists! You can use option '-d' to set a different result directory name."
    fi
    entries_seen=1
  done
fi
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Run cds_extractor.pl
# Only with option '-f': extract the proteins of every '*.<ext>' file in the
# current working directory and concatenate the resulting '*.faa' files into
# the subject file used for the BLASTP database below.
if [[ $Subject_Ext ]]; then
  msg "Running cds_extractor.pl on all '*.$Subject_Ext' files in the current working directory"
  # The call is either 'perl cds_extractor.pl' or 'cds_extractor.pl'; split it
  # into words once, so that it can be executed as an array without 'eval'.
  read -r -a cds_extractor_cmd <<< "$Cds_Extractor_Cmd"
  for file in *."$Subject_Ext"; do
    # replace the input extension with '.faa', the output extension of cds_extractor
    file_no_ext="${file%."$Subject_Ext"}.faa"
    File_Names+=("$file_no_ext") # append to array
    # Every argument is quoted, hence file names with whitespace or shell
    # metacharacters reach the script unchanged instead of being re-parsed
    # by the shell. '&>>' appends STDOUT and STDERR to the log for error catching.
    "${cds_extractor_cmd[@]}" -i "$file" -p -f &>> "$Result_Dir/cds_extractor.log"
  done
  Subject_File="$Result_Dir/prot_finder.faa" # for creating BLASTP db below
  cat "${File_Names[@]}" > "$Subject_File" # concatenate all files stored in the array
fi
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Run BLASTP
# Build a protein BLAST database 'prot_finder_db' from the subject file and
# search it with the query proteins, using the binaries of the chosen suite.
Blast_Report="$Result_Dir/prot_finder.blastp"
msg "Running BLASTP '$Blastp_Suite' with subject '$Subject_File', query '$Query_File', evalue '$Evalue', and $Threads threads"
case $Blastp_Suite in
  legacy)
    formatdb -p T -i "$Subject_File" -n prot_finder_db
    blastall -p blastp -d prot_finder_db -i "$Query_File" -o "$Blast_Report" -e "$Evalue" -F F -s T -b 500 -a "$Threads"
    ;;
  plus)
    # database creation output is logged ('&>') instead of discarded, for error catching
    makeblastdb -in "$Subject_File" -input_type fasta -dbtype prot -out prot_finder_db &> "$Result_Dir/makeblastdb.log"
    blastp -db prot_finder_db -query "$Query_File" -out "$Blast_Report" -evalue "$Evalue" -seg no -use_sw_tback -num_alignments 500 -num_threads "$Threads"
    ;;
esac
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Run prot_finder.pl
# Filter the BLASTP report with the given cutoffs. STDOUT of prot_finder.pl is
# passed through as STDOUT of this script, its STDERR goes to a log file.
msg "Running prot_finder.pl with identity cutoff '$Ident_Cut', query coverage cutoff '$Cov_Query_Cutoff', and subject coverage cutoff '$Cov_Subject_Cutoff'"
# The call is either 'perl prot_finder.pl' or 'prot_finder.pl'; split it into
# words and collect all arguments in an array. Executing the array (instead of
# 'eval' on a concatenated string) keeps paths with whitespace or shell
# metacharacters intact and prevents them from being re-parsed by the shell.
read -r -a prot_finder_cmd <<< "$Prot_Finder_Cmd"
prot_finder_cmd+=(-d "$Result_Dir" -f -q "$Query_File" -s "$Subject_File" -r "$Blast_Report" -i "$Ident_Cut" -cov_q "$Cov_Query_Cutoff" -cov_s "$Cov_Subject_Cutoff")
if [[ $Opt_Best_Hit ]]; then
  prot_finder_cmd+=(-b)
fi
if [[ $Opt_Align ]]; then
  prot_finder_cmd+=(-a -t "$Threads")
fi
if [[ $Clustal_Path ]]; then
  prot_finder_cmd+=(-p "$Clustal_Path")
fi
"${prot_finder_cmd[@]}" 2> "$Result_Dir/prot_finder.log" # '2>' instead of '/dev/null' for error catching

msg "All result files stored in directory '$Result_Dir'"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
### Clean up non-essential files
# Only with option '-m': delete intermediate files, the BLASTP database, the
# BLASTP report, the log files and the index file of the subject file.
if [[ $Opt_Clean_Up ]]; then
  msg "Removing non-essential output files, option '-m'"

  # protein FASTA files written by cds_extractor (none with option '-s')
  for faa_file in "${File_Names[@]}"; do
    rm -v "$faa_file" 1>&2
  done

  # the concatenated subject file exists only if cds_extractor was run
  if [[ $Subject_Ext ]]; then
    rm -v "$Subject_File" 1>&2
  fi

  case $Blastp_Suite in
    legacy)
      rm -v formatdb.log 1>&2
      # 'error.log' shows up only sometimes, so remove it only if present
      if [[ -r error.log ]]; then
        rm -v error.log 1>&2
      fi
      ;;
  esac

  rm -v prot_finder_db.p* "$Blast_Report" "$Result_Dir"/*.log "${Subject_File}.idx" 1>&2
fi

msg "Pipeline finished!"