Mercurial > repos > aafc-mbb > itsx
comparison ITSx @ 0:57e76d405a97 draft default tip
Uploaded
| author | aafc-mbb | 
|---|---|
| date | Mon, 14 Mar 2016 16:15:08 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:57e76d405a97 | 
|---|---|
| 1 #!/usr/bin/perl | |
| 2 # ITSx ITS Extractor | |
| 3 $app_title = "ITSx -- Identifies ITS sequences and extracts the ITS region"; | |
| 4 $app_author = "Johan Bengtsson-Palme et al., University of Gothenburg"; | |
| 5 $app_version = "1.0.11"; | |
| 6 $app_message = ""; | |
| 7 # ----------------------------------------------------------------- # | |
| 8 | |
| 9 # License information | |
| 10 $license = | |
| 11 " ITSx - ITS Extractor -- Identifies ITS sequences and extracts the ITS region\ | |
| 12 Copyright (C) 2012-2014 Johan Bengtsson-Palme et al.\ | |
| 13 \ | |
| 14 This program is free software: you can redistribute it and/or modify\ | |
| 15 it under the terms of the GNU General Public License as published by\ | |
| 16 the Free Software Foundation, either version 3 of the License, or\ | |
| 17 (at your option) any later version.\ | |
| 18 \ | |
| 19 This program is distributed in the hope that it will be useful,\ | |
| 20 but WITHOUT ANY WARRANTY; without even the implied warranty of\ | |
| 21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\ | |
| 22 GNU General Public License for more details.\ | |
| 23 \ | |
| 24 You should have received a copy of the GNU General Public License\ | |
| 25 along with this program, in a file called 'license.txt'\ | |
| 26 If not, see: http://www.gnu.org/licenses/.\ | |
| 27 "; | |
| 28 | |
| 29 ## BUGS: | |
| 30 $bugs = "New features in this version ($app_version):\ | |
| 31 - None\ | |
| 32 \ | |
| 33 Fixed bugs in this version ($app_version):\ | |
| 34 - Fixed a bug causing newline characters to be occasionally skipped in the 'its1.full_and_partial.fasta' FASTA output file when the '--anchor' option was used\ | |
| 35 \ | |
| 36 Known bugs in this version ($app_version):\ | |
| 37 - None\ | |
| 38 "; | |
| 39 | |
| 40 ## OPTIONS: | |
| 41 $options = "\ | |
| 42 -i {file} : DNA FASTA input file to investigate\ | |
| 43 -o {file} : Base for the names of output file(s)\ | |
| 44 -p {directory} : A path to a directory of HMM-profile collections representing ITS conserved regions, default is in the same directory as ITSx itself\ | |
| 45 --date {T or F} : Adds a date and time stamp to the output directory, off (F) by default\ | |
| 46 --reset {T or F} : Re-creates the HMM-database before ITSx is run, off (F) by default\ | |
| 47 | |
| 48 Sequence selection options:\ | |
| 49 -t {character code} : Profile set to use for the search, see the User's Guide (comma-separated), default is all\ | |
| 50 -E {value} : Domain E-value cutoff for a sequence to be included in the output, default = 1e-5\ | |
| 51 -S {value} : Domain score cutoff for a sequence to be included in the output, default = 0\ | |
| 52 -N {value} : The minimal number of domains that must match a sequence before it is included, default = 2\ | |
| 53 --selection_priority {sum, domains, eval, score} : Selects what will be of highest priority when determining the origin of the sequence, default is sum\ | |
| 54 --search_eval {value} : The E-value cutoff used in the HMMER search, high numbers may slow down the process, cannot be used with the --search_score option, default is 0.01\ | |
| 55 --search_score {value} : The score cutoff used in the HMMER search, low numbers may slow down the process, cannot be used with the --search_eval option, default is to used E-value cutoff, not score\ | |
| 56 --allow_single_domain {e-value,score or F} : Allow inclusion of sequences that only find a single domain, given that they meet the given E-value and score thresholds, on with parameters 1e-9,0 by default\ | |
| 57 --allow_reorder {T or F} : Allows profiles to be in the wrong order on extracted sequences, off (F) by default\ | |
| 58 --complement {T or F} : Checks both DNA strands against the database, creating reverse complements, on (T) by default\ | |
| 59 --cpu {value} : the number of CPU threads to use, default is 1\ | |
| 60 --multi_thread {T or F} : Multi-thread the HMMER-search, on (T) if number of CPUs (--cpu option > 1), else off (F) by default\ | |
| 61 --heuristics {T or F} : Selects whether to use HMMER's heuristic filtering, off (F) by default\ | |
| 62 | |
| 63 Output options:\ | |
| 64 --summary {T or F} : Summary of results output, on (T) by default\ | |
| 65 --graphical {T or F} : 'Graphical' output, on (T) by default\ | |
| 66 --fasta {T or F} : FASTA-format output of extracted ITS sequences, on (T) by default\ | |
| 67 --preserve {T or F} : Preserve sequence headers in input file instead of printing out ITSx headers, off (F) by default\ | |
| 68 --save_regions {SSU,ITS1,5.8S,ITS2,LSU,all,none} : A comma separated list of regions to output separate FASTA files for, 'ITS1,ITS2' by default\ | |
| 69 --anchor {integer or HMM} : Saves an additional number of bases before and after each extracted region. If set to 'HMM' all bases matching the corresponding HMM will be output, default = 0\ | |
| 70 --only_full {T or F} : If true, output is limited to full-length regions, off (F) by default\ | |
| 71 --partial {integer} : Saves additional FASTA-files for full and partial ITS sequences longer than the specified cutoff, default = 0 (off)\ | |
| 72 --concat {T or F} : Saves a FASTA-file with concatenated ITS sequences (with 5.8S removed), off (F) by default\ | |
| 73 --minlen {integer} : Minimum length the ITS regions must be to be outputted in the concatenated file (see above), default = 0\ | |
| 74 --positions {T or F} : Table format output containing the positions ITS sequences were found in, on (T) by default\ | |
| 75 --table {T or F} : Table format output of sequences containing probable ITS sequences, off (F) by default\ | |
| 76 --not_found {T or F} : Saves a list of non-found entries, on (T) by default\ | |
| 77 --detailed_results {T or F} : Saves a tab-separated list of all results, off (F) by default\ | |
| 78 --truncate {T or F} : Truncates the FASTA output to only contain the actual ITS sequences found, on (T) by default\ | |
| 79 --silent {T or F} : Supresses printing progress info to stderr, off (F) by default\ | |
| 80 --graph_scale {value} : Sets the scale of the graph output, if value is zero, a percentage view is shown, default = 0\ | |
| 81 --save_raw {T or F} : Saves all raw data for searches etc. instead of removing it on finish, off (F) by default\ | |
| 82 | |
| 83 -h : displays this help message\ | |
| 84 --help : displays this help message\ | |
| 85 --bugs : displays the bug fixes and known bugs in this version of ITSx\ | |
| 86 --license : displays licensing information\ | |
| 87 "; | |
| 88 | |
| 89 | |
| 90 ## Print title message | |
| 91 print STDERR "$app_title\nby $app_author\nVersion: $app_version\n$app_message"; | |
| 92 print STDERR "-----------------------------------------------------------------\n"; | |
| 93 | |
| 94 ## Setup default variable values | |
| 95 use List::Util qw(first max maxstr min minstr reduce shuffle sum); | |
| 96 | |
| 97 $bindir = $0; | |
| 98 $bindir =~ s/_x//; | |
| 99 $input = ""; | |
| 100 $output = "ITSx_out"; | |
| 101 $hmmscan = ""; | |
| 102 $profileDB = "$bindir\_db/HMMs"; | |
| 103 $type = "all"; | |
| 104 $E = 1e-5; | |
| 105 $S = 0; | |
| 106 $N = 2; | |
| 107 $priority = "sum"; | |
| 108 $search_eval = 0.01; | |
| 109 $search_score = ""; | |
| 110 $allow_single_E = 1e-9; | |
| 111 $allow_single_score = 0; | |
| 112 #$allow_single_E = -1; # Turns off single-domain matching by E-value | |
| 113 #$allow_single_score = 0; # Turns off single-domain matching by score | |
| 114 $allow_reorder = 0; | |
| 115 $complement = 1; | |
| 116 $cpu = 1; | |
| 117 $multi_thread = "unset"; | |
| 118 $heuristics = 0; | |
| 119 $out_sum = 1; | |
| 120 $out_graph = 1; | |
| 121 $out_fasta = 1; | |
| 122 $out_preserve = 0; | |
| 123 $out_ssu = 0; | |
| 124 $out_its1 = 1; | |
| 125 $out_its2 = 1; | |
| 126 $out_58S = 0; | |
| 127 $out_lsu = 0; | |
| 128 $out_pos = 1; | |
| 129 $out_table = 0; | |
| 130 $out_not = 1; | |
| 131 $out_date = 0; | |
| 132 $out_joined = 0; | |
| 133 $out_results = 0; | |
| 134 $out_partial = 0; | |
| 135 $out_concat = 0; | |
| 136 $concat_minlen = 0; | |
| 137 $truncate = 1; | |
| 138 $anchor = 0; | |
| 139 $only_full = 0; | |
| 140 $graph_scale = 0; | |
| 141 $debug = 0; | |
| 142 $reset = 0; | |
| 143 | |
| 144 ## Read command-line options | |
| 145 for ($i = 0; $i <= scalar(@ARGV); $i++) { # Goes through the list of arguments | |
| 146 $arg = @ARGV[$i]; # Stores the current argument in $arg | |
| 147 | |
| 148 if ($arg eq "-i") { # Read input files from -i flag | |
| 149 $i++; | |
| 150 $input = @ARGV[$i]; | |
| 151 } | |
| 152 if ($arg eq "-o") { # Read output files from -o flag | |
| 153 $i++; | |
| 154 $output = @ARGV[$i]; | |
| 155 } | |
| 156 if ($arg eq "-p") { # Read profile database from -p flag | |
| 157 $i++; | |
| 158 $profileDB = @ARGV[$i]; | |
| 159 } | |
| 160 if ($arg eq "--hmmscan") { # Read pre-computed hmmscan output file from --hmmscan flag ('undocumented' feature) | |
| 161 $i++; | |
| 162 $hmmscan = @ARGV[$i]; | |
| 163 } | |
| 164 if ($arg eq "--date") { # Determine whether or not to add a date stamp based on the --date flag | |
| 165 $i++; | |
| 166 if (substr(@ARGV[$i],0,1) =~ m/^[Ff0]/) { # Check if argument begins with "F", "f", or "0" | |
| 167 $out_date = 0; | |
| 168 } else { | |
| 169 $out_date = 1; | |
| 170 } | |
| 171 } | |
| 172 if ($arg eq "--reset") { # Reset HMM database? | |
| 173 $i++; | |
| 174 if (substr(@ARGV[$i],0,1) =~ m/^[Ff0]/) { # Check if argument begins with "F", "f", or "0" | |
| 175 $reset = 0; | |
| 176 } else { | |
| 177 $reset = 1; | |
| 178 } | |
| 179 } | |
| 180 | |
| 181 if ($arg eq "-t") { # Select what types of ITSs to look for using the -t flag | |
| 182 $i++; | |
| 183 $type = @ARGV[$i]; | |
| 184 } | |
| 185 if ($arg eq "-E") { # Set the E-value cutoff using the -E flag | |
| 186 $i++; | |
| 187 $E = @ARGV[$i]; | |
| 188 } | |
| 189 if ($arg eq "-S") { # Set the score cutoff using the -S flag | |
| 190 $i++; | |
| 191 $S = @ARGV[$i]; | |
| 192 } | |
| 193 if ($arg eq "-N") { # Set the number of found domains cutoff using the -N flag | |
| 194 $i++; | |
| 195 $N = @ARGV[$i]; | |
| 196 } | |
| 197 if ($arg eq "--selection_priority") { # Set how to order the ITS types using the --selection_priority flag | |
| 198 $i++; | |
| 199 $priority = @ARGV[$i]; | |
| 200 } | |
| 201 if ($arg eq "--search_eval") { # Set the E-value cutoff for the HMMER search using the --search_eval flag | |
| 202 $i++; | |
| 203 $search_eval = @ARGV[$i]; | |
| 204 $search_score = ""; # Turns off score cutoff | |
| 205 } | |
| 206 if ($arg eq "--search_score") { # Set the score cutoff for the HMMER search using the --search_score flag | |
| 207 $i++; | |
| 208 $search_score = @ARGV[$i]; | |
| 209 $search_eval = ""; # Turns off E-value cutoff | |
| 210 } | |
| 211 if ($arg eq "--allow_single_domain") { # Determine whether or not to allow single domain matches based on the --allow_single_domain flag | |
| 212 $i++; | |
| 213 if (substr(@ARGV[$i],0,1) =~ m/^[Ff0]/) { # Check if argument begins with "F", "f", or "0" | |
| 214 $allow_single_E = -1; # Turns off single-domain matching by E-value | |
| 215 $allow_single_score = 0; # Turns off single-domain matching by score | |
| 216 } else { | |
| 217 ($allow_single_E,$allow_single_score) = split(',',@ARGV[$i]); # Turns on single-domain matching, assigning the first given value as the E-value cutoff, and the second as score cutoff | |
| 218 } | |
| 219 } | |
| 220 if ($arg eq "--allow_reorder") { # Determine whether or not to allow the domains to be in the wrong order based on the --allow_reorder flag | |
| 221 $i++; | |
| 222 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 223 $allow_reorder = 1; | |
| 224 } else { | |
| 225 $allow_reorder = 0; | |
| 226 } | |
| 227 } | |
| 228 if ($arg eq "--complement") { # Determine whether or not to scan the complementary strand of the input file based on the --complement flag | |
| 229 $i++; | |
| 230 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 231 $complement = 1; | |
| 232 } else { | |
| 233 $complement = 0; | |
| 234 } | |
| 235 } | |
| 236 if ($arg eq "--cpu") { # Set the number of CPUs to use based on the --cpu flag | |
| 237 $i++; | |
| 238 $cpu = @ARGV[$i]; | |
| 239 } | |
| 240 if ($arg eq "--multi_thread") { # Determine whether or not to multi-thread the HMMER step based on the --multi_thread flag | |
| 241 $i++; | |
| 242 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 243 $multi_thread = 1; | |
| 244 } else { | |
| 245 $multi_thread = 0; | |
| 246 } | |
| 247 } | |
| 248 if ($arg eq "--heuristics") { # Determine whether or not to use HMMER's heuristic filtering based on the --heuristics flag | |
| 249 $i++; | |
| 250 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 251 $heuristics = 1; | |
| 252 } else { | |
| 253 $heuristics = 0; | |
| 254 } | |
| 255 } | |
| 256 | |
| 257 if ($arg eq "--summary") { # Determine whether or not to output a summary based on the --summary flag | |
| 258 $i++; | |
| 259 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 260 $out_sum = 1; | |
| 261 } else { | |
| 262 $out_sum = 0; | |
| 263 } | |
| 264 } | |
| 265 if ($arg eq "--graphical") { # Determine whether or not to output a graphical representation of matches based on the --graphical flag | |
| 266 $i++; | |
| 267 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 268 $out_graph = 1; | |
| 269 } else { | |
| 270 $out_graph = 0; | |
| 271 } | |
| 272 } | |
| 273 if ($arg eq "--detailed_results") { # Determine whether or not to output a detailed results list, based on the --detailed_results flag | |
| 274 $i++; | |
| 275 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 276 $out_results = 1; | |
| 277 } else { | |
| 278 $out_results = 0; | |
| 279 } | |
| 280 } | |
| 281 if ($arg eq "--partial") { # Set the full-and-partial cutoff | |
| 282 $i++; | |
| 283 $out_partial = @ARGV[$i]; | |
| 284 } | |
| 285 if ($arg eq "--anchor") { # Set the length of the sequence "anchors" | |
| 286 $i++; | |
| 287 $anchor = @ARGV[$i]; | |
| 288 } | |
| 289 if ($arg eq "--only_full") { # Output only full-length regions | |
| 290 $i++; | |
| 291 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 292 $only_full = 1; | |
| 293 } else { | |
| 294 $only_full = 0; | |
| 295 } | |
| 296 } | |
| 297 if ($arg eq "--save_regions") { # Determine which regions to output FASTA files for based on the --save_regions flag | |
| 298 $i++; | |
| 299 @save_regions = split(',',uc(@ARGV[$i])); | |
| 300 $out_ssu = 0; | |
| 301 $out_its1 = 0; | |
| 302 $out_its2 = 0; | |
| 303 $out_58S = 0; | |
| 304 $out_lsu = 0; | |
| 305 foreach $save_region (@save_regions) { | |
| 306 if ($save_region eq "SSU") { | |
| 307 $out_ssu = 1; | |
| 308 } | |
| 309 if ($save_region eq "ITS1") { | |
| 310 $out_its1 = 1; | |
| 311 } | |
| 312 if ($save_region eq "5.8S") { | |
| 313 $out_58S = 1; | |
| 314 } | |
| 315 if ($save_region eq "ITS2") { | |
| 316 $out_its2 = 1; | |
| 317 } | |
| 318 if ($save_region eq "LSU") { | |
| 319 $out_lsu = 1; | |
| 320 } | |
| 321 if ($save_region eq "ALL") { | |
| 322 $out_ssu = 1; | |
| 323 $out_its1 = 1; | |
| 324 $out_its2 = 1; | |
| 325 $out_58S = 1; | |
| 326 $out_lsu = 1; | |
| 327 } | |
| 328 if ($save_region eq "NONE") { | |
| 329 $out_ssu = 0; | |
| 330 $out_its1 = 0; | |
| 331 $out_its2 = 0; | |
| 332 $out_58S = 0; | |
| 333 $out_lsu = 0; | |
| 334 } | |
| 335 } | |
| 336 } | |
| 337 if ($arg eq "--positions") { # Determine whether or not to output a positions file based on the --positions flag | |
| 338 $i++; | |
| 339 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 340 $out_pos = 1; | |
| 341 } else { | |
| 342 $out_pos = 0; | |
| 343 } | |
| 344 } | |
| 345 if ($arg eq "--concat") { # Determine whether or not to output a concatednated ITS1 + ITS2 file | |
| 346 $i++; | |
| 347 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 348 $out_concat = 1; | |
| 349 } else { | |
| 350 $out_concat = 0; | |
| 351 } | |
| 352 } | |
| 353 if ($arg eq "--minlen") { # Set the min length of the combined ITS1 and ITS2 sequences for concatenation | |
| 354 $i++; | |
| 355 $concat_minlen = @ARGV[$i]; | |
| 356 } | |
| 357 if ($arg eq "--fasta") { # Determine whether or not to output FASTA-files based on the --fasta flag | |
| 358 $i++; | |
| 359 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 360 $out_fasta = 1; | |
| 361 } else { | |
| 362 $out_fasta = 0; | |
| 363 } | |
| 364 } | |
| 365 if ($arg eq "--preserve") { # Determine whether or not to preserve FASTA-headers based on the --preserve flag | |
| 366 $i++; | |
| 367 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 368 $out_preserve = 1; | |
| 369 } else { | |
| 370 $out_preserve = 0; | |
| 371 } | |
| 372 } | |
| 373 if ($arg eq "--joined") { # Determine whether or not to output a FASTA-file containing ALL sorts of output sequences (for debugging) | |
| 374 $i++; | |
| 375 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 376 $out_joined = 1; | |
| 377 } else { | |
| 378 $out_joined = 0; | |
| 379 } | |
| 380 } | |
| 381 if ($arg eq "--table") { # Determine whether or not to output tables of all potential matches based on the --table flag | |
| 382 $i++; | |
| 383 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 384 $out_table = 1; | |
| 385 } else { | |
| 386 $out_table = 0; | |
| 387 } | |
| 388 } | |
| 389 if ($arg eq "--not_found") { # Determine whether or not to output a list of sequences that are not ITSs based on the --not_found flag | |
| 390 $i++; | |
| 391 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 392 $out_not = 1; | |
| 393 } else { | |
| 394 $out_not = 0; | |
| 395 } | |
| 396 } | |
| 397 if ($arg eq "--silent") { # Determine whether or not to output anything to the screen based on the --silent flag | |
| 398 $i++; | |
| 399 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 400 $silent = 1; | |
| 401 } else { | |
| 402 $silent = 0; | |
| 403 } | |
| 404 } | |
| 405 if ($arg eq "--graph_scale") { # Set the scale of the graphical output based on the --graph_scale flag | |
| 406 $i++; | |
| 407 $graph_scale = @ARGV[$i]; | |
| 408 } | |
| 409 if ($arg eq "--save_raw") { # Determine whether or not to save all the raw intermediate data based on the --save_raw flag | |
| 410 $i++; | |
| 411 if (substr(@ARGV[$i],0,1) =~ m/^[Tt1]/) { # Check if argument begins with "T", "t", or "1" | |
| 412 $save_raw = 1; | |
| 413 } else { | |
| 414 $save_raw = 0; | |
| 415 } | |
| 416 } | |
| 417 | |
| 418 ## If "-h" or "--help" are among the options, output usage data and options | |
| 419 if (($arg eq "-h") || ($arg eq "--help")) { | |
| 420 print "Usage: ITSx -i <input file> -o <output file>\nOptions:$options"; | |
| 421 print "-----------------------------------------------------------------\n"; | |
| 422 exit; # Exit ITSx | |
| 423 } | |
| 424 | |
| 425 ## If "--bugs" is among the options, output bugs and features information | |
| 426 if ($arg eq "--bugs") { | |
| 427 print "$bugs\n"; | |
| 428 exit; # Exit ITSx | |
| 429 } | |
| 430 | |
| 431 ## If "--license" is among the options, output license information | |
| 432 if ($arg eq "--license") { | |
| 433 print "$license\n"; | |
| 434 exit; # Exit ITSx | |
| 435 } | |
| 436 | |
| 437 if ($arg eq "--debug") { # Run ITSx in debug mode | |
| 438 $debug = 1; | |
| 439 } | |
| 440 if ($arg eq "--pipeline") { # Run ITSx in pipeline mode | |
| 441 $pipeline = 1; | |
| 442 } | |
| 443 } | |
| 444 | |
| 445 ## Setup some variables dependent on input | |
| 446 | |
| 447 if ($multi_thread eq "unset") { # If the multi-thread option is not set | |
| 448 if ($cpu > 1) { # Then if the number of CPUs used > 1, then multi-thread HMMER searches | |
| 449 $multi_thread = 1; | |
| 450 } else { # Else, run HMMER searches sequentially on one CPU | |
| 451 $multi_thread = 0; | |
| 452 } | |
| 453 } | |
| 454 | |
| 455 if ($hmmscan ne "") { # If a pre-computed hmmscan output is supplied | |
| 456 $output = $hmmscan; # Then set the base of the output directory name to be the same as that hmmscan output file | |
| 457 } | |
| 458 | |
| 459 ## Check for binaries | |
| 460 | |
| 461 chomp($path = `which hmmpress`); # Get the path for hmmpress | |
| 462 if ($path eq "") { # If the path is empty, then show an error message and exit ITSx | |
| 463 print STDERR "FATAL ERROR :: Could not locate HMMER binaries! It seems that hmmpress is not installed properly.\ | |
| 464 Consult the manual for installation instructions. Note that HMMER3 is required. Previous HMMER-versions will not work.\ | |
| 465 This error is fatal, and ITSx will now abort.\n"; | |
| 466 print STDERR "-----------------------------------------------------------------\n"; | |
| 467 exit; | |
| 468 } | |
| 469 | |
| 470 chomp($path = `which hmmscan`); # Get the path for hmmscan | |
| 471 if ($path eq "") { # If the path is empty, then show an error message and exit ITSx | |
| 472 print STDERR "FATAL ERROR :: Could not locate HMMER binaries! It seems that hmmscan is not installed properly.\ | |
| 473 Consult the manual for installation instructions. Note that HMMER3 is required. Previous HMMER-versions will not work.\ | |
| 474 This error is fatal, and ITSx will now abort.\n"; | |
| 475 print STDERR "-----------------------------------------------------------------\n"; | |
| 476 exit; | |
| 477 } | |
| 478 | |
| 479 | |
| 480 ## Check for database | |
| 481 chomp($errormsg = `ls $profileDB* 2>&1 1>/dev/null`); # Get the error msg when looking for the profile database | |
| 482 if (substr($errormsg,0,4) eq "ls: ") { # If the error message begins with "ls: ", then show an error message and exit ITSx | |
| 483 print STDERR "FATAL ERROR :: The specified profile database could not be found.\ | |
| 484 Consult the manual for installation instructions.\ | |
| 485 This error is fatal, and ITSx will now abort.\n"; | |
| 486 print STDERR "-----------------------------------------------------------------\n"; | |
| 487 exit; | |
| 488 } | |
| 489 | |
| 490 if ($pipeline == 0) { # If ITSx is not run in pipeline mode (i.e. from ITSx) | |
| 491 if ($out_date == 1) { # If a date and time stamp should be supplied | |
| 492 ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); # Get the date and time | |
| 493 $year = $year + 1900; # Format the year | |
| 494 $mon = $mon + 1; # Format the month | |
| 495 if ($mon < 10) { # Add a zero to the month, if needed | |
| 496 $mon = "0" . $mon; | |
| 497 } | |
| 498 if ($mday < 10) { # Add a zero to the day, if needed | |
| 499 $mday = "0" . $mday; | |
| 500 } | |
| 501 if ($hour < 10) { # Add a zero to the hour, if needed | |
| 502 $hour = "0" . $hour; | |
| 503 } | |
| 504 if ($min < 10) { # Add a zero to the minute, if needed | |
| 505 $min = "0" . $min; | |
| 506 } | |
| 507 $outputDate = ".$year\-$mon\-$mday\_$hour.$min"; # Create a date and time stamp | |
| 508 $outputDate =~ s./.-.g; # Remove any potential slashes in the output name (as this will confuse ITSx's file naming) | |
| 509 $output = $output . $outputDate; # Add the date and time stamp top the output base name | |
| 510 } | |
| 511 } | |
| 512 | |
| 513 $tempDir = "ITSx_temp_directory__$output"; # Setup a temporary directory variable | |
| 514 $tempDir =~ s./.-.g; # Remove any potential slashes in the output name (as this will confuse ITSx's file naming) | |
| 515 | |
| 516 if ($pipeline == 0) { # If not running in pipeline mode | |
| 517 ## Create a summary file | |
| 518 if ($out_sum == 1) { # If summary output is on | |
| 519 $now = localtime; # Get the current time | |
| 520 open (SUMMARY, ">$output.summary.txt"); # Create the summary file | |
| 521 print SUMMARY "ITSx run started at $now.\n"; # Output the starting time for the analysis | |
| 522 print SUMMARY "-----------------------------------------------------------------\n"; | |
| 523 close (SUMMARY); # Close summary file | |
| 524 } | |
| 525 } | |
| 526 | |
| 527 ## Create a temporary directory for ITSx | |
| 528 if ($pipeline == 0) { # If ITSx is not run in pipeline mode (i.e. from ITSx) | |
| 529 `mkdir $tempDir 2> /dev/null`; # Create a temporary directory | |
| 530 } | |
| 531 | |
| 532 | |
| 533 ## Prepare profile database | |
| 534 ## Get the current time and output info message | |
| 535 $now = localtime; | |
| 536 if ($silent == 0) { | |
| 537 print STDERR "$now : Preparing HMM database (should be quick)...\n"; | |
| 538 } | |
| 539 | |
| 540 ## Setup profile index | |
| 541 %profileIndex = {}; | |
| 542 | |
| 543 # A = Alveolata | |
| 544 # B = Bryophyta | |
| 545 # C = Bacillariophyta | |
| 546 # D = Amoebozoa | |
| 547 # E = Euglenozoa | |
| 548 # F = Fungi | |
| 549 # G = Chlorophyta (green algae) | |
| 550 # H = Rhodophyta (red algae) | |
| 551 # I = Phaeophyceae (brown algae) | |
| 552 # L = Marchantiophyta (liverworts) | |
| 553 # M = Metazoa | |
| 554 # N = Microsporidia | |
| 555 # O = Oomycota | |
| 556 # P = Haptophyceae (prymnesiophytes) | |
| 557 # Q = Raphidophyceae | |
| 558 # R = Rhizaria | |
| 559 # S = Synurophyceae | |
| 560 # T = Tracheophyta (higher plants) | |
| 561 # U = Eustigmatophyceae | |
| 562 # X = Apusozoa | |
| 563 # Y = Parabasalia | |
| 564 | |
| 565 $profileIndex{"A"} = "alveolates"; | |
| 566 $profileIndex{"B"} = "bryophyta"; | |
| 567 $profileIndex{"C"} = "bacillariophyta"; | |
| 568 $profileIndex{"D"} = "amoebozoa"; | |
| 569 $profileIndex{"E"} = "euglenozoa"; | |
| 570 $profileIndex{"F"} = "fungi"; | |
| 571 $profileIndex{"G"} = "chlorophyta"; | |
| 572 $profileIndex{"H"} = "rhodophyta"; | |
| 573 $profileIndex{"I"} = "phaeophyceae"; | |
| 574 $profileIndex{"J"} = "undefined"; | |
| 575 $profileIndex{"K"} = "undefined"; | |
| 576 $profileIndex{"L"} = "marchantiophyta"; | |
| 577 $profileIndex{"M"} = "metazoa"; | |
| 578 $profileIndex{"N"} = "microsporidia"; | |
| 579 $profileIndex{"O"} = "oomycota"; | |
| 580 $profileIndex{"P"} = "haptophyceae"; | |
| 581 $profileIndex{"Q"} = "raphidophyceae"; | |
| 582 $profileIndex{"R"} = "rhizaria"; | |
| 583 $profileIndex{"S"} = "synurophyceae"; | |
| 584 $profileIndex{"T"} = "tracheophyta"; | |
| 585 $profileIndex{"U"} = "eustigmatophyceae"; | |
| 586 $profileIndex{"V"} = "undefined"; | |
| 587 $profileIndex{"W"} = "undefined"; | |
| 588 $profileIndex{"X"} = "apusozoa"; | |
| 589 $profileIndex{"Y"} = "parabasalia"; | |
| 590 $profileIndex{"Z"} = "undefined"; | |
| 591 | |
| 592 | |
| 593 @profileList = split(',',uc($type)); # Get the list of profile types | |
| 594 foreach $entry (@profileList) { # Go through the entered types | |
| 595 if (($entry eq "ALL") || ($entry eq ".")) { # If "all" among the entries | |
| 596 push(@profileSet,"A"); # Add the alveolates profiles to the investigation set | |
| 597 push(@profileSet,"B"); # Add the bryophytes profiles to the investigation set | |
| 598 push(@profileSet,"C"); # Add the bacillariophyta profiles to the investigation set | |
| 599 push(@profileSet,"D"); # Add the amoebozoa profiles to the investigation set | |
| 600 push(@profileSet,"E"); # Add the euglenozoa profiles to the investigation set | |
| 601 push(@profileSet,"F"); # Add the fungi profiles to the investigation set | |
| 602 push(@profileSet,"G"); # Add the green algae profiles to the investigation set | |
| 603 push(@profileSet,"H"); # Add the red algae profiles to the investigation set | |
| 604 push(@profileSet,"I"); # Add the brown algae profiles to the investigation set | |
| 605 push(@profileSet,"L"); # Add the liverworts profiles to the investigation set | |
| 606 push(@profileSet,"M"); # Add the metazoa profiles to the investigation set | |
| 607 #push(@profileSet,"N"); # Add the microsporidia profiles to the investigation set | |
| 608 push(@profileSet,"O"); # Add the oomycetes profiles to the investigation set | |
| 609 push(@profileSet,"P"); # Add the prymnesiophytes profiles to the investigation set | |
| 610 push(@profileSet,"Q"); # Add the raphidophytes profiles to the investigation set | |
| 611 push(@profileSet,"R"); # Add the rhizaria profiles to the investigation set | |
| 612 push(@profileSet,"S"); # Add the synurophyceae profiles to the investigation set | |
| 613 push(@profileSet,"T"); # Add the tracheophyta (higher plants) profiles to the investigation set | |
| 614 push(@profileSet,"U"); # Add the eustigmatophytes profiles to the investigation set | |
| 615 #push(@profileSet,"X"); # Add the apusozoa profiles to the investigation set | |
| 616 #push(@profileSet,"Y"); # Add the parabasalia profiles to the investigation set | |
| 617 } else { | |
| 618 if (length($entry) == 1) { # If the name has only one character | |
| 619 if ($entry =~ m/[ABCDERFHILMNOPQRSTUXY]/) { # If the selected set exists | |
| 620 push(@profileSet,$entry); # Add the selected profiles to the investigation set | |
| 621 } | |
| 622 } else { | |
| 623 if ($entry =~ m/ALVEOL/) { | |
| 624 push(@profileSet,"A"); # Add the alveolates profiles to the investigation set | |
| 625 } | |
| 626 if ($entry =~ m/BRYO/) { | |
| 627 push(@profileSet,"B"); # Add the bryophytes profiles to the investigation set | |
| 628 } | |
| 629 if ($entry =~ m/MOSS/) { | |
| 630 push(@profileSet,"B"); # Add the bryophytes profiles to the investigation set | |
| 631 } | |
| 632 if ($entry =~ m/BACILL/) { | |
| 633 push(@profileSet,"C"); # Add the bacillariophyta profiles to the investigation set | |
| 634 } | |
| 635 if ($entry =~ m/DIATOM/) { | |
| 636 push(@profileSet,"C"); # Add the bacillariophyta profiles to the investigation set | |
| 637 } | |
| 638 if ($entry =~ m/AMOEB/) { | |
| 639 push(@profileSet,"D"); # Add the amoebozoa profiles to the investigation set | |
| 640 } | |
| 641 if ($entry =~ m/EUGLE/) { | |
| 642 push(@profileSet,"E"); # Add the euglenozoa profiles to the investigation set | |
| 643 } | |
| 644 if ($entry =~ m/FUNG/) { | |
| 645 push(@profileSet,"F"); # Add the fungi profiles to the investigation set | |
| 646 } | |
| 647 if ($entry =~ m/GREEN/) { | |
| 648 push(@profileSet,"G"); # Add the green algae profiles to the investigation set | |
| 649 } | |
| 650 if ($entry =~ m/CHLORO/) { | |
| 651 push(@profileSet,"G"); # Add the green algae profiles to the investigation set | |
| 652 } | |
| 653 if ($entry =~ m/RED-AL/) { | |
| 654 push(@profileSet,"H"); # Add the red algae profiles to the investigation set | |
| 655 } | |
| 656 if ($entry =~ m/RHODO/) { | |
| 657 push(@profileSet,"H"); # Add the red algae profiles to the investigation set | |
| 658 } | |
| 659 if ($entry =~ m/BROWN/) { | |
| 660 push(@profileSet,"I"); # Add the brown algae profiles to the investigation set | |
| 661 } | |
| 662 if ($entry =~ m/PHAEOP/) { | |
| 663 push(@profileSet,"I"); # Add the brown algae profiles to the investigation set | |
| 664 } | |
| 665 if ($entry =~ m/LIVER/) { | |
| 666 push(@profileSet,"L"); # Add the liverworts profiles to the investigation set | |
| 667 } | |
| 668 if ($entry =~ m/MARCH/) { | |
| 669 push(@profileSet,"L"); # Add the liverworts profiles to the investigation set | |
| 670 } | |
| 671 if ($entry =~ m/METAZ/) { | |
| 672 push(@profileSet,"M"); # Add the metazoa profiles to the investigation set | |
| 673 } | |
| 674 if ($entry =~ m/ANIMAL/) { | |
| 675 push(@profileSet,"M"); # Add the metazoa profiles to the investigation set | |
| 676 } | |
| 677 if ($entry =~ m/MICROSPOR/) { | |
| 678 push(@profileSet,"N"); # Add the microsporidia profiles to the investigation set | |
| 679 } | |
| 680 if ($entry =~ m/OOMYC/) { | |
| 681 push(@profileSet,"O"); # Add the oomycetes profiles to the investigation set | |
| 682 } | |
| 683 if ($entry =~ m/PRYMN/) { | |
| 684 push(@profileSet,"P"); # Add the prymnesiophytes profiles to the investigation set | |
| 685 } | |
| 686 if ($entry =~ m/HAPTO/) { | |
| 687 push(@profileSet,"P"); # Add the prymnesiophytes profiles to the investigation set | |
| 688 } | |
| 689 if ($entry =~ m/RAPHID/) { | |
| 690 push(@profileSet,"Q"); # Add the raphidophytes profiles to the investigation set | |
| 691 } | |
| 692 if ($entry =~ m/RHIZA/) { | |
| 693 push(@profileSet,"R"); # Add the rhizaria profiles to the investigation set | |
| 694 } | |
| 695 if ($entry =~ m/SYNUR/) { | |
| 696 push(@profileSet,"S"); # Add the synurophyceae profiles to the investigation set | |
| 697 } | |
| 698 if ($entry =~ m/TRACHE/) { | |
| 699 push(@profileSet,"T"); # Add the tracheophyta profiles to the investigation set | |
| 700 } | |
| 701 if ($entry =~ m/PLANTS/) { | |
| 702 push(@profileSet,"T"); # Add the tracheophyta profiles to the investigation set | |
| 703 } | |
| 704 if ($entry =~ m/EUSTIG/) { | |
| 705 push(@profileSet,"U"); # Add the eustigmatophytes profiles to the investigation set | |
| 706 } | |
| 707 if ($entry =~ m/APUSO/) { | |
| 708 push(@profileSet,"X"); # Add the apusozoa profiles to the investigation set | |
| 709 } | |
| 710 if ($entry =~ m/PARAB/) { | |
| 711 push(@profileSet,"Y"); # Add the parabasalia profiles to the investigation set | |
| 712 } | |
| 713 } | |
| 714 } | |
| 715 } | |
| 716 | |
| 717 foreach $set (@profileSet) { # For each set of profiles in the the full profile set for investigation | |
| 718 $hmmPath = $profileDB . "/" . $set . ".hmm"; # Determine the path to the HMM-file | |
| 719 chomp($modelCount = `grep -c "//" $hmmPath`); # Count the number of models in the HMM-file | |
| 720 push(@modelCount,$modelCount); # Add the number of models in this HMM-file to the list of model counts | |
| 721 if ($reset == 1) { | |
| 722 `rm -f $hmmPath.h3* 2> /dev/null`; # Delete old HMM-files | |
| 723 `hmmpress $hmmPath 2> /dev/null`; # Prepare the HMM-file for searching | |
| 724 ## Redirecting stderr is a quick and dirty solution to get rid of the messages... Could be made more elegant | |
| 725 } | |
| 726 } | |
| 727 | |
| 728 ## Clean-up input files and create complementary strand if needed | |
| 729 ## Get the current time and output an info message | |
| 730 $now = localtime; | |
| 731 if ($silent == 0) { | |
| 732 print STDERR "$now : Checking and handling input sequence data (should not take long)...\n"; | |
| 733 } | |
| 734 | |
| 735 ## Open the summary file for writing | |
| 736 if ($out_sum == 1) { # If summary output is on | |
| 737 open (SUMMARY, ">>$output.summary.txt"); # Append to the summary file | |
| 738 } | |
| 739 | |
| 740 if ($input ne "") { # If an input file is given | |
| 741 ## Read from file | |
| 742 open (SEQUENCES, $input); # Open the input file for reading | |
| 743 open (MAIN, ">$tempDir/main.fasta"); # Create a temporary file for storing the cleaned sequences representing the main strand | |
| 744 open (COMPLEMENT, ">$tempDir/complement.fasta"); # Create a temporary file for storing the cleaned sequences representing the complementary strand | |
| 745 $inputSequenceCount = 0; # Reset input sequence counter | |
| 746 while ($sequence = <SEQUENCES>) { # Repeat for every line in the input file | |
| 747 chomp($sequence); # Truncate any potential line feeds | |
| 748 $sequence =~ tr/\r\n//d; # Remove all carriage return and new line characters | |
| 749 if (substr($sequence,0,1) eq ">") { # If a new FASTA entry is found in the input file | |
| 750 $inputSequenceCount++; # Add one to the input sequence counter | |
| 751 print MAIN $mainSeq . "\n"; # Write the previous main DNA sequence to the main sequence file | |
| 752 print MAIN $sequence . " main\n"; # Write the definition line of the new sequence to the main sequence file | |
| 753 $sequenceDB{"$sequenceID"} = $mainSeq; # Add sequence to sequence database | |
| 754 $headers{"$sequenceID"} = $header; # Add the header to the header database | |
| 755 push(@sequenceOrder,$sequenceID); # Add this sequence ID to the ordered list of sequences | |
| 756 ($sequenceID) = split(" ",substr($sequence,1)); | |
| 757 $header = $sequence; # Save the sequence header | |
| 758 $mainSeq = ""; # Empty the main sequence entry | |
| 759 if ($complement == 1) { # If the complementary file should be written | |
| 760 $complementSeq = reverse($complementSeq); # Reverse the complementary DNA sequence | |
| 761 print COMPLEMENT $complementSeq . "\n"; # Write the previous complementary DNA sequence to the complementary sequence file | |
| 762 print COMPLEMENT $sequence . " complement\n"; # Write the definition line of the new sequence to the complement sequence file | |
| 763 $complementSeq = ""; # Empty the complementary sequence entry | |
| 764 } | |
| 765 } else { # If this line is just a continuation of the current DNA sequence | |
| 766 $mseq = $sequence; # Store this part of the DNA sequence in the intermediate varaible $mseq | |
| 767 $mseq =~ s/[ .-]//g; # Remove any gap characters present in this sequence part (good if input was an alignment) | |
| 768 $mseq =~ tr/[a-z]/[A-Z]/; # Make all letters uppercase | |
| 769 $mseq =~ s/[^A-Z]//g; # Remove all non-alphabetic characters | |
| 770 $mseq =~ tr/U/T/; # Exchanges U:s for T:s (Uracil to Thymine, good if input was RNA sequence) | |
| 771 $mainSeq = $mainSeq . $mseq; # Add the intermediate DNA sequence to the end of the main DNA sequence entry | |
| 772 | |
| 773 if ($complement == 1) { # If the complementary file should be written | |
| 774 $cseq = $sequence; # Store this part of the DNA sequence in the intermediate varaible $cseq | |
| 775 $cseq =~ s/[ .-]//g; # Remove any gap characters present in this sequence part (good if input was an alignment) | |
| 776 $cseq =~ tr/[a-z]/[A-Z]/; # Make all letters uppercase | |
| 777 $cseq =~ s/[^A-Z]//g; # Remove all non-alphabetic characters | |
| 778 $cseq =~ tr/ACGTURYSWKMBDHVN/TGCAAYRWSMKVHDBN/; # Replace all characters with its complementary base | |
| 779 $complementSeq = $complementSeq . $cseq; # Add the intermediate DNA sequence to the end of the complementary DNA sequence entry | |
| 780 } | |
| 781 } | |
| 782 } | |
| 783 ## When the input file's end is reached | |
| 784 print MAIN $mainSeq . "\n"; # Write the last main DNA sequence to the main sequence file | |
| 785 if ($complement == 1) { # If the complementary file should be written | |
| 786 $complementSeq = reverse($complementSeq); # Reverse the complementary DNA sequence | |
| 787 print COMPLEMENT $complementSeq . "\n"; # Write the last complementary DNA sequence to the complementary sequence file | |
| 788 } | |
| 789 $sequenceDB{"$sequenceID"} = $mainSeq; # Add sequence to sequence database | |
| 790 $headers{"$sequenceID"} = $header; # Add the header to the header database | |
| 791 push(@sequenceOrder,$sequenceID); # Add this sequence ID to the ordered list of sequences | |
| 792 $mainSeq = ""; # Empty the main sequence entry | |
| 793 $complementSeq = ""; # Empty the complementary sequence entry | |
| 794 close (SEQUENCES); # Close the sequence input file | |
| 795 close (COMPLEMENT); # Close the complementary output file | |
| 796 close (MAIN); # Close the main output file | |
| 797 } else { # If no input file is supplied, then read from stdin instead | |
| 798 $input = "$tempDir/main.fasta"; # Set up a temporary input file path | |
| 799 open (MAIN, ">$tempDir/main.fasta"); # Create a temporary file for storing the cleaned sequences representing the main strand | |
| 800 open (COMPLEMENT, ">$tempDir/complement.fasta"); # Create a temporary file for storing the cleaned sequences representing the complementary strand | |
| 801 $inputSequenceCount = 0; # Reset input sequence counter | |
| 802 while ($sequence = <STDIN>) { # Repeat for every line in the standard input | |
| 803 chomp($sequence); # Truncate any potential line feeds | |
| 804 if (substr($sequence,0,1) eq ">") { # If a new FASTA entry is found in the input | |
| 805 $inputSequenceCount++; # Add one to the input sequence counter | |
| 806 print MAIN $mainSeq . "\n"; # Write the previous main DNA sequence to the main sequence file | |
| 807 print MAIN $sequence . " main\n"; # Write the definition line of the new sequence to the main sequence file | |
| 808 $sequenceDB{"$sequenceID"} = $mainSeq; # Add sequence to sequence database | |
| 809 $headers{"$sequenceID"} = $header; # Add the header to the header database | |
| 810 push(@sequenceOrder,$sequenceID); # Add this sequence ID to the ordered list of sequences | |
| 811 $sequenceID = split(" ",substr($sequence,1)); | |
| 812 $header = $sequence; # Save the sequence header | |
| 813 $mainSeq = ""; # Empty the main sequence entry | |
| 814 if ($complement == 1) { # If the complementary file should be written | |
| 815 $complementSeq = reverse($complementSeq); # Reverse the complementary DNA sequence | |
| 816 print COMPLEMENT $complementSeq . "\n"; # Write the previous complementary DNA sequence to the complementary sequence file | |
| 817 print COMPLEMENT $sequence . " complement\n"; # Write the definition line of the new sequence to the complement sequence file | |
| 818 $complementSeq = ""; # Empty the complementary sequence entry | |
| 819 } | |
| 820 } else { # If this line is just a continuation of the current DNA sequence | |
| 821 $mseq = $sequence; # Store this part of the DNA sequence in the intermediate varaible $mseq | |
| 822 $mseq =~ s/[ .-]//g; # Remove any gap characters present in this sequence part (good if input was an alignment) | |
| 823 $mseq =~ tr/[a-z]/[A-Z]/; # Make all letters uppercase | |
| 824 $mseq =~ s/[^A-Z]//g; # Remove all non-alphabetic characters | |
| 825 $mseq =~ tr/U/T/; # Exchanges U:s for T:s (Uracil to Thymine, good if input was RNA sequence) | |
| 826 $mainSeq = $mainSeq . $mseq; # Add the intermediate DNA sequence to the end of the main DNA sequence entry | |
| 827 | |
| 828 if ($complement == 1) { # If the complementary file should be written | |
| 829 $cseq = $sequence; # Store this part of the DNA sequence in the intermediate varaible $cseq | |
| 830 $cseq =~ s/[ .-]//g; # Remove any gap characters present in this sequence part (good if input was an alignment) | |
| 831 $cseq =~ tr/[a-z]/[A-Z]/; # Make all letters uppercase | |
| 832 $cseq =~ s/[^A-Z]//g; # Remove all non-alphabetic characters | |
| 833 $cseq =~ tr/ACGTURYSWKMBDHVN/TGCAAYRWSMKVHDBN/; # Replace all characters with its complementary base | |
| 834 $complementSeq = $complementSeq . $cseq; # Add the intermediate DNA sequence to the end of the complementary DNA sequence entry | |
| 835 } | |
| 836 } | |
| 837 } | |
| 838 ## When the input file's end is reached | |
| 839 print MAIN $mainSeq . "\n"; # Write the last main DNA sequence to the main sequence file | |
| 840 if ($complement == 1) { # If the complementary file should be written | |
| 841 $complementSeq = reverse($complementSeq); # Reverse the complementary DNA sequence | |
| 842 print COMPLEMENT $complementSeq . "\n"; # Write the last complementary DNA sequence to the complementary sequence file | |
| 843 } | |
| 844 $sequenceDB{"$sequenceID"} = $mainSeq; # Add sequence to sequence database | |
| 845 $headers{"$sequenceID"} = $header; # Add the header to the header database | |
| 846 push(@sequenceOrder,$sequenceID); # Add this sequence ID to the ordered list of sequences | |
| 847 $mainSeq = ""; # Empty the main sequence entry | |
| 848 $complementSeq = ""; # Empty the complementary sequence entry | |
| 849 close (COMPLEMENT); # Close the complementary output file | |
| 850 close (MAIN); # Close the main output file | |
| 851 } | |
| 852 | |
| 853 if ($out_sum == 1) { # If summary output should be written | |
| 854 print SUMMARY "Number of sequences in input file: \t$inputSequenceCount\n"; # Write info on the number of input sequences to the summary file | |
| 855 } | |
| 856 | |
| 857 | |
| 858 ## Perform HMM-scan | |
| 859 if ($hmmscan eq "") { # If a pre-computed hmmscan output file is not supplied | |
| 860 if ($heuristics == 0) { # If HMMER's heuristic filtering should not be used | |
| 861 $heurMax = "--max"; # Set the heurMax to "--max" (indicating that HMMER should turn off filtering) | |
| 862 } else { | |
| 863 $heurMax = ""; # Set the heurMax to empty (indicating that HMMER should turn on filtering) | |
| 864 } | |
| 865 if ($multi_thread == 0) { # If multi-threading is off | |
| 866 ## Get the current time and output info message | |
| 867 $now = localtime; | |
| 868 if ($silent == 0) { | |
| 869 print STDERR "$now : Comparing sequences to HMM database (this may take a long while)...\n"; | |
| 870 } | |
| 871 foreach $set (@profileSet) { # Go sequentially through all profile sets to search for | |
| 872 $hmmPath = $profileDB . "/" . $set . ".hmm"; # Set the path to the HMM-file of the current set | |
| 873 if ($search_eval ne "") { # If E-value cutoff is use for the search | |
| 874 hmmerSearch("hmmscan --cpu $cpu $heurMax -E $search_eval $hmmPath $tempDir/main.fasta","$tempDir/main.$set.hmmscan","M",$set); # Call HMMER with E-value cutoff | |
| 875 } else { # If score cutoff is use for the search | |
| 876 hmmerSearch("hmmscan --cpu $cpu $heurMax -T $search_score $hmmPath $tempDir/main.fasta","$tempDir/main.$set.hmmscan","M",$set); # Call HMMER with score cutoff | |
| 877 } | |
| 878 if ($complement == 1) { # If the complementary file should be scanned | |
| 879 if ($search_eval ne "") { # If E-value cutoff is use for the search | |
| 880 hmmerSearch("hmmscan --cpu $cpu $heurMax -E $search_eval $hmmPath $tempDir/complement.fasta","$tempDir/complement.$set.hmmscan","C",$set); # Call HMMER with E-value cutoff | |
| 881 } else { # If score cutoff is use for the search | |
| 882 hmmerSearch("hmmscan --cpu $cpu $heurMax -T $search_score $hmmPath $tempDir/complement.fasta","$tempDir/complement.$set.hmmscan","C",$set); # Call HMMER with score cutoff | |
| 883 } | |
| 884 } | |
| 885 } | |
| 886 } else { # If multi-threading is on | |
| 887 ## Get the current time and output info message | |
| 888 $now = localtime; | |
| 889 if ($silent == 0) { | |
| 890 print STDERR "$now : Doing paralellised comparison to HMM database (this may take a long while)...\n"; | |
| 891 } | |
| 892 | |
| 893 ## Determining number of cpus per thread | |
| 894 if ($complement == 1) { # If the complementary file should be scanned | |
| 895 $hmmcpu = int(0.5 * $cpu / scalar(@profileSet)); # Assign X CPUs to each thread, X = 0.5 * (TOTAL_CPUs_USED) / (TOTAL_NUMBER_OF_PROFILE_SETS) | |
| 896 } else { # If the complementary file should not be scanned | |
| 897 $hmmcpu = int($cpu / scalar(@profileSet)); # Assign X CPUs to each thread, X = (TOTAL_CPUs_USED) / (TOTAL_NUMBER_OF_PROFILE_SETS) | |
| 898 } | |
| 899 if ($hmmcpu < 1) { # If the number of CPUs per thread is smaller than 1 | |
| 900 $hmmcpu = 1; # Give each thread at least one CPU to work on | |
| 901 } | |
| 902 ## Main strand searches... | |
| 903 $cpuCount = 0; | |
| 904 foreach $set (@profileSet) { # Go through each profile set to investigate | |
| 905 if ($cpuCount < $cpu) { | |
| 906 $cpuCount++; | |
| 907 $pid = fork(); # Fork off a copy of this process for this set | |
| 908 } else { | |
| 909 $deceasedPID = wait(); # Wait until a PID is finished, and gather its number | |
| 910 $pid = fork(); # Fork off a copy of this process for this set | |
| 911 } | |
| 912 if ($pid != 0) { # If this is the parent process | |
| 913 push(@pids,$pid); # Add the new process ID to the list of active process IDs | |
| 914 } else { # If this is the new child process | |
| 915 $hmmPath = $profileDB . "/" . $set . ".hmm"; # Set the path to the HMM-file of the current set | |
| 916 if ($search_eval ne "") { # If E-value cutoff is use for the search | |
| 917 hmmerSearch("hmmscan --cpu $hmmcpu $heurMax -E $search_eval $hmmPath $tempDir/main.fasta","$tempDir/main.$set.hmmscan","M",$set); # Call HMMER with E-value cutoff | |
| 918 } else { # If score cutoff is use for the search | |
| 919 hmmerSearch("hmmscan --cpu $hmmcpu $heurMax -T $search_score $hmmPath $tempDir/main.fasta","$tempDir/main.$set.hmmscan","M",$set); # Call HMMER with score cutoff | |
| 920 } | |
| 921 ## Stop child process... | |
| 922 exit; # Exits the child process | |
| 923 } | |
| 924 } | |
| 925 ## Revese strand searches... | |
| 926 if ($complement == 1) { # If the complementary file should be scanned | |
| 927 foreach $set (@profileSet) { # Go through each profile set to investigate | |
| 928 if ($cpuCount < $cpu) { | |
| 929 $cpuCount++; | |
| 930 $pid = fork(); # Fork off a copy of this process for this set | |
| 931 } else { | |
| 932 $deceasedPID = wait(); # Wait until a PID is finished, and gather its number | |
| 933 $pid = fork(); # Fork off a copy of this process for this set | |
| 934 } | |
| 935 if ($pid != 0) { # If this is the parent process | |
| 936 push(@pids,$pid); # Add the new process ID to the list of active process IDs | |
| 937 } else { # If this is the new child process | |
| 938 $hmmPath = $profileDB . "/" . $set . ".hmm"; # Set the path to the HMM-file of the current set | |
| 939 if ($search_eval ne "") { # If E-value cutoff is use for the search | |
| 940 hmmerSearch("hmmscan --cpu $hmmcpu $heurMax -E $search_eval $hmmPath $tempDir/complement.fasta","$tempDir/complement.$set.hmmscan","C",$set); # Call HMMER with E-value cutoff | |
| 941 } else { # If score cutoff is use for the search | |
| 942 hmmerSearch("hmmscan --cpu $hmmcpu $heurMax -T $search_score $hmmPath $tempDir/complement.fasta","$tempDir/complement.$set.hmmscan","C",$set); # Call HMMER with score cutoff | |
| 943 } | |
| 944 ## Stop child process... | |
| 945 exit; # Exits the child process | |
| 946 } | |
| 947 } | |
| 948 } | |
| 949 ## Get the current time and output the active process IDs | |
| 950 $now = localtime; | |
| 951 #print STDERR " $now : Active PIDs: "; | |
| 952 #foreach $p (@pids) { # Go through the list of PIDs | |
| 953 # print STDERR "$p "; # Print the PID | |
| 954 #} | |
| 955 #print STDERR "\n"; # Print a new line | |
| 956 do { # Loop until all child PIDs have finished. | |
| 957 $deceasedPID = wait(); # Wait until a PID is finished, and gather its number | |
| 958 $now = localtime; # Get the current time | |
| 959 if ($deceasedPID > -1) { # If the PID that finished wasn't the last active one | |
| 960 # print STDERR " $now : PID $deceasedPID finished.\n"; # Print finished PID | |
| 961 } else { # If PID that finished was the last | |
| 962 print STDERR " $now : All processes finished.\n"; # Print that all PIDs have finished | |
| 963 } | |
| 964 } until (wait() == -1); # Do this loop until all PIDs have finished | |
| 965 $now = localtime; # Get current time | |
| 966 print STDERR "$now : Parallel HMM-scan finished.\n"; # Print informative finishing message | |
| 967 } | |
| 968 } else { # If a pre-computed hmmscan file is supplied then | |
| 969 ## Get the current time and output that the hmmscan step is skipped | |
| 970 $now = localtime; | |
| 971 if ($silent == 0) { | |
| 972 print STDERR "$now : Skipping hmmscan! Using $hmmscan as input for the analysis instead.\n"; | |
| 973 } | |
| 974 } | |
| 975 | |
| 976 ## Analyse HMM-scan output | |
| 977 ## Get the current time and output info | |
| 978 $now = localtime; | |
| 979 if ($silent == 0) { | |
| 980 print STDERR "$now : Analysing results of HMM-scan (this might take quite some time)...\n"; | |
| 981 } | |
| 982 | |
| 983 ## Set up output files | |
| 984 if ($out_table == 1) { # If table output is on | |
| 985 open (TABLE, ">$output.hmmer.table"); # Create a table output file | |
| 986 } | |
| 987 if ($out_graph == 1) { # If graphical output is on | |
| 988 open (GRAPH, ">$output.graph"); # Create a graph output file | |
| 989 } | |
| 990 if ($out_not == 1) { # If not-found output is on | |
| 991 open (NOTFOUND, ">$tempDir/$output\_hmmer_no_detections.txt"); # Create a HMMER not-found output file | |
| 992 } | |
| 993 | |
| 994 $setI = 0; # Set the profile set indicator to zero | |
| 995 foreach $set (@profileSet) { # Go through all the profile sets to be investigated | |
| 996 for ($co = 0; $co <= 1; $co++) { # Do main (and complementary) strand analysis in order | |
| 997 if ($co > 0) { # If main strand analysis is finished | |
| 998 if ($complement == 1) { # If complementary strand should be analysed | |
| 999 open (HMMOUTPUT, "$tempDir/complement.$set.hmmscan"); # Open hmmscan output for reading | |
| 1000 open (SEQUENCES, "$tempDir/complement.fasta"); # Open complementary sequence file for reading | |
| 1001 if ($out_table == 1) { # If table output is on, write a header for this set | |
| 1002 print TABLE "***********************************************************\n"; | |
| 1003 print TABLE "$set matches on complementary strand:\n"; | |
| 1004 } | |
| 1005 if ($out_graph == 1) { # If graphical output is on, write a header for this set | |
| 1006 print GRAPH "***********************************************************\n"; | |
| 1007 print GRAPH "$set matches on complementary strand:\n"; | |
| 1008 } | |
| 1009 } else { # If complementary strand should not be analysed | |
| 1010 last; # Exit this loop | |
| 1011 } | |
| 1012 } | |
| 1013 if ($co == 0) { # If main strand analysis is not finished | |
| 1014 open (HMMOUTPUT, "$tempDir/main.$set.hmmscan"); # Open hmmscan output for reading | |
| 1015 open (SEQUENCES, "$tempDir/main.fasta"); # Open main sequence file for reading | |
| 1016 if ($out_table == 1) { # If table output is on, write a header for this set | |
| 1017 print TABLE "***********************************************************\n"; | |
| 1018 print TABLE "$set matches on main strand:\n"; | |
| 1019 } | |
| 1020 if ($out_graph == 1) { # If graphical output is on, write a header for this set | |
| 1021 print GRAPH "***********************************************************\n"; | |
| 1022 print GRAPH "$set matches on main strand:\n"; | |
| 1023 } | |
| 1024 } | |
| 1025 ## Read and analyse hmmscan output file | |
| 1026 while ($line = <HMMOUTPUT>) { # Read in the hmmscan output file, line by line | |
| 1027 chomp($line); # Remove any potential line feeds | |
| 1028 if (substr($line,0,13) eq "## New query:") { # If this line begin with "## New query:", then this is a new entry | |
| 1029 undef %hits; # Empty the hits hash | |
| 1030 undef %evals; # Empty the e-value hash | |
| 1031 undef %scores; # Empty the score hash | |
| 1032 $querytemp = substr($line,14); # Extract everything from this line, except for the start ("## New query:") | |
| 1033 ($query,$length) = split('\t',$querytemp); # Split query name, length and DNA sequence | |
| 1034 if ($co == 0) { # If main strand analysis | |
| 1035 $DNA = $sequenceDB{"$query"}; | |
| 1036 } else { # If complementary strand | |
| 1037 $cseq = $sequenceDB{"$query"}; # Store the DNA sequence in the intermediate varaible $cseq | |
| 1038 $cseq =~ tr/[a-z]/[A-Z]/; # Make all letters uppercase | |
| 1039 $cseq =~ s/[^A-Z]//g; # Remove all non-alphabetic characters | |
| 1040 $cseq =~ tr/ACGTURYSWKMBDHVN/TGCAAYRWSMKVHDBN/; # Replace all characters with its complementary base | |
| 1041 $DNA = reverse($cseq); | |
| 1042 } | |
| 1043 } else { | |
| 1044 | |
| 1045 if ($line ne "//") { | |
| 1046 ## Find domain annotations... | |
| 1047 ($query,$matchProfile,$length,$domNo,$sign,$score,$bias,$cE,$iE,$hmmFrom,$hmmTo,$hmmends,$queryFrom,$queryTo,$queryends,$envFrom,$envTo,$envends,$acc) = split('\t',$line); # Split the line into a collection of stat variables | |
| 1048 | |
| 1049 $useQueryFrom = $queryFrom; | |
| 1050 if ($hmmFrom > 1) { # If the HMM-profile is not matched from the beginning | |
| 1051 $hmmDiff = $hmmFrom - 1; | |
| 1052 if ($useQueryFrom > $hmmDiff) { | |
| 1053 $useQueryFrom = $queryFrom - $hmmDiff; | |
| 1054 } else { | |
| 1055 $useQueryFrom = 1; | |
| 1056 } | |
| 1057 } | |
| 1058 | |
| 1059 if (uc($anchor) eq "HMM") { | |
| 1060 $anchorLen = $hmmTo - $hmmFrom + 1; | |
| 1061 } else { | |
| 1062 $anchorLen = $anchor; | |
| 1063 } | |
| 1064 | |
| 1065 | |
| 1066 $query_profile_match = $query . ":" . $matchProfile; | |
| 1067 $profileExists = 0; # Assume that the newly found match profile ($matchProfile) is not already found for this sequence | |
| 1068 if (exists($hits{$query_profile_match})) { # If a profile from the list is the same as the match profile | |
| 1069 ($hitFrom,$hitTo,$hitProfile,$hitScore,$hitE) = split('\t',$hits{$query_profile_match}); # Split the entry in list into stat variables | |
| 1070 if ($iE < $hitE) { # If the new match profile has a smaller E-value than the one from the list | |
| 1071 $hits{$query_profile_match} = "$useQueryFrom\t$envTo\t$matchProfile\t$score\t$iE\t$anchorLen"; # Replace the data in the hit list with the data for the newly found match profile | |
| 1072 $evals{$query_profile_match} = $iE; # Replace the E-value in the hit list with the E-value for the newly found match profile | |
| 1073 $scores{$query_profile_match} = $score; # Replace the score in the hit list with the score for the newly found match profile | |
| 1074 } | |
| 1075 $profileExists = 1; # Indicate that this match profile was found in the hit list | |
| 1076 } | |
| 1077 if ($profileExists == 0) { # If the match profile was not found in the hit list | |
| 1078 if (($iE <= $E) && ($score >= $S)) { # If this hits lives up to the minimal score and E-value cutoffs | |
| 1079 $hits{$query_profile_match} = "$useQueryFrom\t$envTo\t$matchProfile\t$score\t$iE\t$anchorLen"; # Add the data for the newly found match profile | |
| 1080 $evals{$query_profile_match} = $iE; # Add the E-value for the newly found match profile | |
| 1081 $scores{$query_profile_match} = $score; # Add the score for the newly found match profile | |
| 1082 } | |
| 1083 } | |
| 1084 } else { # If the line only contains "//", the end of this sequence's hmmscan entry is reached | |
| 1085 ## Save analysis results | |
| 1086 @sortedKeys = sort {$hits{$a} <=> $hits{$b}} keys(%hits); # Sort the the list of hits numerically ascending (smallest first) | |
| 1087 undef @sortedHits; | |
| 1088 undef @scores; | |
| 1089 undef @evals; | |
| 1090 foreach $key (@sortedKeys) { | |
| 1091 push(@sortedHits, $hits{$key}); # Add the hit to the list of hits numerically ascending (smallest first) | |
| 1092 push(@scores, $scores{$key}); # Add the score to the scores array | |
| 1093 push(@evals, $evals{$key}); # Add the E-value to the evals array | |
| 1094 } | |
| 1095 | |
| 1096 ## If the number of hits > N, the min eval < E and the max score > S then include query sequence | |
| 1097 ## OR if a single domain satisfies the thresholds and this is allowed, include it! | |
| 1098 if ( ((scalar(@sortedHits) >= $N) && (min(@evals) <= $E) && (max(@scores) >= $S)) || | |
| 1099 ((scalar(@sortedHits) > 0) && ($allow_single_E >= 0) && (min(@evals) <= $allow_single_E) && (max(@scores) >= $allow_single_score)) ) { | |
| 1100 if ($debug == 1) { # If debugging mode is on | |
| 1101 print STDERR $query . " :\t" . scalar(@sortedHits) . "\t" . min(@evals) . "\t" . max(@scores) . "\n"; # Print some top hit statistics | |
| 1102 } | |
| 1103 | |
| 1104 ## Save some total stats to be able to determine origin of ITS sequence | |
| 1105 if (scalar(@evals) > 0) { # If there are any E-values stored | |
| 1106 $averageE = sum(@evals) / scalar(@evals); # Calculate the average E-value for this profile set | |
| 1107 $averageScore = sum(@scores) / scalar(@scores); # Calculate the average score for this profile set | |
| 1108 $numberOfDomains = scalar(@sortedHits); # Calculate the number of domains matched on this sequence | |
| 1109 | |
| 1110 #$scoreSum = sum(@scores) / @modelCount[$setI]; # Calculate score sum as: sum / (no. of profiles of this given type) | |
| 1111 $scoreSum = sum(@scores) / 4; # Calculate score sum as: sum / (no. of profiles of this given type) | |
| 1112 | |
| 1113 $saveThis = "$query\t$set\t$co\t$numberOfDomains\t$averageE\t$averageScore\t$scoreSum\t$DNA\t"; # Collect the variables to save for this sequence and this profile set | |
| 1114 foreach $hit (@sortedHits) { # Go through the list of hits and add specific information to save from each hit | |
| 1115 ($hitFrom,$hitTo,$hitProfile,$hitScore,$hitE,$hitanchorlen) = split('\t',$hit); # Extract information from this hit | |
| 1116 $saveThis = $saveThis . "$hitFrom;$hitTo;$hitProfile;$hitScore;$hitE;$hitanchorlen\t"; # Add information to the list of variables to save | |
| 1117 } | |
| 1118 #push(@allHits, $saveThis); # Add this information to the collection of all hits for this sequence, across all profile sets | |
| 1119 if (exists($allHits{$query})) { | |
| 1120 $allHits{$query} = $allHits{$query} . "\n" . $saveThis; | |
| 1121 } else { | |
| 1122 $allHits{$query} = $saveThis; | |
| 1123 } | |
| 1124 } | |
| 1125 | |
| 1126 if ($out_table == 1) { # If table output is on | |
| 1127 print TABLE $query . "\t" . $length . "\t"; # Print query and length information to table | |
| 1128 foreach $hit (@sortedHits) { # Go through each hit in the hit list | |
| 1129 ($hitFrom,$hitTo,$hitProfile,$hitScore,$hitE,$hitanchorlen) = split('\t',$hit); # Extract data corresponding to this hit | |
| 1130 print TABLE "$hitFrom - $hitTo: $hitProfile ($hitScore, $hitE)\t"; # Print hit information to table | |
| 1131 } | |
| 1132 print TABLE "\n"; # Print new line | |
| 1133 } | |
| 1134 | |
| 1135 if ($out_graph == 1) { # If graphical output is on | |
| 1136 print GRAPH ">> " . $query . "\t" . $length . " bp\n"; # Print a sequence header | |
| 1137 $insertPoint = 0; # Set the domain insert point to beginning of line | |
| 1138 $hi = 0; # Set hit number to zero | |
| 1139 foreach $hit (@sortedHits) { # Go through the hit list | |
| 1140 ($hitFrom,$hitTo,$hitProfile,$hitScore,$hitE,$anchorLen) = split('\t',$hit); # Split the hit into stat variables | |
| 1141 if ($graph_scale == 0) { # If the graph scale is scaled individually to 100% for each sequence | |
| 1142 $pFrom = $hitFrom / $length * 100; # Set the profile start on graph relative to its position in the sequence | |
| 1143 $pTo = $hitTo / $length * 100; # Set the profile end on graph relative to its position in the sequence | |
| 1144 $pEnd = 100; # Set the end of the sequence graph to be at 100 | |
| 1145 } else { # If the scale is the same for all sequences | |
| 1146 $pFrom = $hitFrom * $graph_scale; # Set the profile start on graph scaled to the parameter given | |
| 1147 $pTo = $hitTo * $graph_scale; # Set the profile end on graph scaled to the parameter given | |
| 1148 $pEnd = $length * $graph_scale; # Set the end of the sequence graph to be at the end of the sequence scaled to the parameter given | |
| 1149 } | |
| 1150 for ($insertPoint = $insertPoint; $insertPoint <= $pFrom; $insertPoint++) { # Go forward through the sequence, moving the insert point one step at a time until the beginning of the next profile is reached | |
| 1151 print GRAPH "-"; # Print a "-" | |
| 1152 } | |
| 1153 print GRAPH substr($hitProfile,2,3); # When the profile is reached, print its name | |
| 1154 $insertPoint = $insertPoint + 3; # Move the insert point three steps forward, to account for the inserted name | |
| 1155 ($nextHitStart,$nextHitEnd,$nextProfile) = split('\t',@sortedHits[$hi + 1]); # Check where the next hit in the list is located | |
| 1156 if (($nextHitStart <= $hitTo) && ($nextHitStart > 0)) { # If the next hit in the list overlaps with this profile | |
| 1157 if ($graph_scale == 0) { # If the scale is relative | |
| 1158 $pTo = $nextHitStart / $length * 100 - 1; # Change the profile end on the graph to be where this next profile starts | |
| 1159 } else { # If the scale is the same for all sequences | |
| 1160 $pTo = $nextHitStart * $graph_scale - 1; # Change the profile end on the graph to be where this next profile starts | |
| 1161 } | |
| 1162 } | |
| 1163 for ($insertPoint = $insertPoint; $insertPoint <= $pTo; $insertPoint++) { # Go forward through the sequence, moving the insert point one step at a time until the end of the current profile is reached | |
| 1164 print GRAPH "="; # Print a "=" | |
| 1165 } | |
| 1166 if (($nextHitStart <= $hitTo) && ($nextHitStart > 0)) { # If the next hit in the list overlaps with this profile | |
| 1167 print GRAPH ">"; # Print a ">" to indicate the profile overlap | |
| 1168 $insertPoint++; # Move the insert point one additional step forward to account for the ">" inserted | |
| 1169 } | |
| 1170 $hi++; # Increase the hit number by one | |
| 1171 } | |
| 1172 for ($insertPoint = $insertPoint; $insertPoint <= $pEnd; $insertPoint++) { # If there is no more profile matches to sequence, go forward through the sequence, moving the insert point one step at a time until the end of the sequence is reached | |
| 1173 print GRAPH "-"; # Print a "-" | |
| 1174 } | |
| 1175 print GRAPH "\n"; # Print a new line, indicating the end of this sequence entry | |
| 1176 } | |
| 1177 } else { # If this sequence didn't find any good-enough profile matches | |
| 1178 if ($out_not == 1) { # If not-found output is on | |
| 1179 print NOTFOUND $query . "\n"; # Print the name of this query to the not-found list | |
| 1180 } | |
| 1181 } | |
| 1182 } | |
| 1183 } | |
| 1184 } | |
| 1185 close (SEQUENCES); # Close the input sequence file | |
| 1186 close (HMMOUTPUT); # Close the hmmscan output file | |
| 1187 } | |
| 1188 $setI++; # Add one to the profile set indicator | |
| 1189 } | |
| 1190 | |
| 1191 ## Close output files | |
| 1192 if ($out_table == 1) { # If table output is on, close the table file | |
| 1193 close (TABLE); | |
| 1194 } | |
| 1195 if ($out_graph == 1) { # If graphical output is on, close the graph file | |
| 1196 close (GRAPH); | |
| 1197 } | |
| 1198 if ($out_not == 1) { # If not-found output is on, close the not-found file | |
| 1199 close (NOTFOUND); | |
| 1200 #$profileCount = scalar(@profileSet); # Count the number of profile sets | |
| 1201 #if ($complement == 1) { # If complementary strand was scanned | |
| 1202 # $profileCount = $profileCount * 2; # Double the number of profile sets that was investigated (and thus the number of not-founds that could at max be found) | |
| 1203 #} | |
| 1204 #`sort $tempDir/hmmer_no_detections.txt | uniq -c | grep " *$profileCount " | sed "s/ *$profileCount //" > $output\_no_detections.txt`; # Sort the not-found list, count the number of profile sets having no matches for each query. Save those that have only non-matches to the hmmer-not-found file | |
| 1205 } | |
| 1206 | |
| 1207 | |
| 1208 ## Create total collected output and FASTA output | |
| 1209 | |
| 1210 if ($out_results == 1) { | |
| 1211 open (RESULTS, ">$output.extraction.results"); # Create a results file | |
| 1212 } | |
| 1213 open (RAWOUT, ">$tempDir/ITSx_output.raw"); # Create a raw output file for ALL data | |
| 1214 open (PROBLEM, ">$output.problematic.txt"); # Create a file for problematic entries | |
| 1215 $foundProblem = 0; | |
| 1216 if ($out_pos == 1) { | |
| 1217 open (POS, ">$output.positions.txt"); # Create a positions file | |
| 1218 } | |
| 1219 if ($out_fasta == 1) { # If FASTA output should be written | |
| 1220 open (FASTA, ">$output.full.fasta"); # Create a FASTA output file for found sequences | |
| 1221 if ($allow_reorder == 0) { # If reordering of domains is not allowed | |
| 1222 open (CHIMERA, ">$output.chimeric.fasta"); # Create a FASTA file for potential chimera sequences with profile matches in the wrong order | |
| 1223 $foundChimera = 0; | |
| 1224 } | |
| 1225 if ($out_partial > 0) { | |
| 1226 open (FULLPARTIAL, ">$output.full_and_partial.fasta"); | |
| 1227 } | |
| 1228 } | |
| 1229 | |
| 1230 if ($out_joined == 1) { # If SSU FASTA output should be written | |
| 1231 open (JOINED, ">$output.joined.fasta"); | |
| 1232 } | |
| 1233 if ($out_ssu == 1) { # If SSU FASTA output should be written | |
| 1234 open (SSU, ">$output.SSU.fasta"); | |
| 1235 } | |
| 1236 if ($out_lsu == 1) { # If LSU FASTA output should be written | |
| 1237 open (LSU, ">$output.LSU.fasta"); | |
| 1238 } | |
| 1239 if ($out_58S == 1) { # If 5.8S FASTA output should be written | |
| 1240 open (MID, ">$output.5_8S.fasta"); | |
| 1241 } | |
| 1242 if ($out_its1 == 1) { # If ITS1 FASTA output should be written | |
| 1243 open (ITS1, ">$output.ITS1.fasta"); | |
| 1244 if ($out_partial > 0) { | |
| 1245 open (ITS1PARTIAL, ">$output.ITS1.full_and_partial.fasta"); | |
| 1246 } | |
| 1247 } | |
| 1248 if ($out_its2 == 1) { # If ITS2 FASTA output should be written | |
| 1249 open (ITS2, ">$output.ITS2.fasta"); | |
| 1250 if ($out_partial > 0) { | |
| 1251 open (ITS2PARTIAL, ">$output.ITS2.full_and_partial.fasta"); | |
| 1252 } | |
| 1253 } | |
| 1254 if ($out_concat == 1) { # If concatenated output should be written | |
| 1255 open (CONCAT, ">$output.concat.fasta"); | |
| 1256 } | |
| 1257 | |
| 1258 undef @sortedHits; # Empty the array of sorted hits | |
| 1259 | |
| 1260 # @sortedHits = sort @allHits; # Sort the full list of hits in alphabetical order (to be able to analyse all sequences with same ID at once) | |
| 1261 if ($out_not == 1) { # If not-found output is on | |
| 1262 open (NOTFOUND, ">$output\_no_detections.txt"); # Create a not-found output file | |
| 1263 $noDetect = 0; | |
| 1264 } | |
| 1265 foreach $sequenceID (@sequenceOrder) { # Sort the full list of hits in their original order (to be able to analyse all sequences with same ID at once) | |
| 1266 if ($sequenceID ne "") { | |
| 1267 $countsInList = 0; | |
| 1268 if (exists($allHits{$sequenceID})) { | |
| 1269 @allHits = split('\n',$allHits{$sequenceID}); | |
| 1270 } else { | |
| 1271 undef @allHits; | |
| 1272 } | |
| 1273 foreach $line (@allHits) { | |
| 1274 @item = split('\t',$line); # Split the line into an array | |
| 1275 if (@item[0] eq $sequenceID) { # If this item corresponds to the current sequence ID | |
| 1276 push(@sortedHits,$line); # Add it to the sorted list of hits | |
| 1277 $countsInList++; | |
| 1278 } | |
| 1279 } | |
| 1280 if ($countsInList == 0) { # If no matches were found | |
| 1281 if ($out_not == 1) { # If not-found output is on | |
| 1282 print NOTFOUND "$sequenceID\n"; # Output the sequence ID | |
| 1283 $noDetect++; | |
| 1284 } | |
| 1285 } | |
| 1286 } | |
| 1287 } | |
| 1288 if ($out_not == 1) { # If not-found output is on | |
| 1289 close (NOTFOUND); | |
| 1290 if ($noDetect == 0) { | |
| 1291 `rm $output\_no_detections.txt 2> /dev/null`; | |
| 1292 } | |
| 1293 } | |
| 1294 | |
| 1295 ## Set all counts for different ITS types to zero | |
| 1296 undef @itsCounts; | |
| 1297 $itsChimeric = 0; | |
| 1298 $itsMain = 0; | |
| 1299 $itsCompl = 0; | |
| 1300 | |
| 1301 push(@sortedHits,"--END--"); # Add a last item to the sorted list, so that all items are securely saved | |
| 1302 $lc = 1; # Set the line count to one | |
| 1303 foreach $line (@sortedHits) { # Go through the list of found hits | |
| 1304 print RAWOUT "$line\n"; # Write the raw data associated with this hit to the raw data output file | |
| 1305 @item = split('\t',$line); # Split the line into an array | |
| 1306 ## If this sequence ID is the same as the saved ones, then add it, else empty the array sequence ID and save | |
| 1307 if ((@seqID[0] ne @item[0]) && (@item[0] ne "") || ($lc > scalar(@sortedHits))) { # If this sequence ID is not the same as the last one and is non-empty, or if the end of the list has been reached | |
| 1308 ## Save profile-type which is most likely... | |
| 1309 if ($priority eq "sum") { # If the sum-of-scores algorithm should be used to determine the most likely profile-type | |
| 1310 ## Reset variables to unrealisticly high or low values | |
| 1311 $best = 0; | |
| 1312 $bestCount = 0; | |
| 1313 $bestEval = 1000; | |
| 1314 $bestScore = -1000; | |
| 1315 $bestSum = -1000; | |
| 1316 | |
| 1317 for ($i = 0; $i < scalar(@seqScoreSum); $i++) { # Go through all sum-of-scores entries | |
| 1318 if (@seqScoreSum[$i] > $bestSum) { # If the current value is larger than the previous top value | |
| 1319 $best = $i; # Set the best value to be the current value | |
| 1320 ## Set all other best variables to those corresponding to the current value | |
| 1321 $bestCount = @seqDomCounts[$i]; | |
| 1322 $bestEval = @seqAvgE[$i]; | |
| 1323 $bestScore = @seqAvgScore[$i]; | |
| 1324 $bestSum = @seqScoreSum[$i]; | |
| 1325 } | |
| 1326 if (@seqScoreSum[$i] == $bestSum) { # If the current value is equal to the previous top value | |
| 1327 if (@seqDomCounts[$i] > $bestCount) { # If the current domain count is larger than the previous top domain count | |
| 1328 $best = $i; # Set the best value to be the current value | |
| 1329 ## Set all other best variables to those corresponding to the current value | |
| 1330 $bestCount = @seqDomCounts[$i]; | |
| 1331 $bestEval = @seqAvgE[$i]; | |
| 1332 $bestScore = @seqAvgScore[$i]; | |
| 1333 $bestSum = @seqScoreSum[$i]; | |
| 1334 } | |
| 1335 if (@seqDomCounts[$i] == $bestCount) { # If the current domain count is equal to the previous top domain count | |
| 1336 if (@seqAvgE[$i] < $bestEval) { # If the current E-value is smaller than the previous top E-value | |
| 1337 $best = $i; # Set the best value to be the current value | |
| 1338 ## Set all other best variables to those corresponding to the current value | |
| 1339 $bestCount = @seqDomCounts[$i]; | |
| 1340 $bestEval = @seqAvgE[$i]; | |
| 1341 $bestScore = @seqAvgScore[$i]; | |
| 1342 $bestSum = @seqScoreSum[$i]; | |
| 1343 } | |
| 1344 } | |
| 1345 } | |
| 1346 } | |
| 1347 } | |
| 1348 if ($priority eq "domains") { # If the number of found domains should be used to determine the most likely profile-type | |
| 1349 ## Reset variables to unrealisticly high or low values | |
| 1350 $best = 0; | |
| 1351 $bestCount = 0; | |
| 1352 $bestEval = 1000; | |
| 1353 $bestScore = -1000; | |
| 1354 | |
| 1355 for ($i = 0; $i < scalar(@seqDomCounts); $i++) { # Go through all domain count entries | |
| 1356 if (@seqDomCounts[$i] > $bestCount) { # If the current domain count is larger than the previous top domain count | |
| 1357 $best = $i; # Set the best value to be the current value | |
| 1358 ## Set all other best variables to those corresponding to the current value | |
| 1359 $bestCount = @seqDomCounts[$i]; | |
| 1360 $bestEval = @seqAvgE[$i]; | |
| 1361 $bestScore = @seqAvgScore[$i]; | |
| 1362 } | |
| 1363 if (@seqDomCounts[$i] == $bestCount) { # If the current domain count is equal to the previous top domain count | |
| 1364 if (@seqAvgE[$i] < $bestEval) { # If the current E-value is smaller than the previous top E-value | |
| 1365 $best = $i; # Set the best value to be the current value | |
| 1366 ## Set all other best variables to those corresponding to the current value | |
| 1367 $bestCount = @seqDomCounts[$i]; | |
| 1368 $bestEval = @seqAvgE[$i]; | |
| 1369 $bestScore = @seqAvgScore[$i]; | |
| 1370 } | |
| 1371 if (@seqAvgE[$i] == $bestEval) { # If the current E-value is equal to the previous top E-value | |
| 1372 if (@seqAvgScore[$i] > $bestScore) { # If the current average score is larger than the previous top average score | |
| 1373 $best = $i; # Set the best value to be the current value | |
| 1374 ## Set all other best variables to those corresponding to the current value | |
| 1375 $bestCount = @seqDomCounts[$i]; | |
| 1376 $bestEval = @seqAvgE[$i]; | |
| 1377 $bestScore = @seqAvgScore[$i]; | |
| 1378 } | |
| 1379 } | |
| 1380 } | |
| 1381 } | |
| 1382 } | |
| 1383 if ($priority eq "eval") { # If the average E-value should be used to determine the most likely profile-type | |
| 1384 ## Reset variables to unrealisticly high or low values | |
| 1385 $best = 0; | |
| 1386 $bestCount = 0; | |
| 1387 $bestEval = 1000; | |
| 1388 $bestScore = -1000; | |
| 1389 | |
| 1390 for ($i = 0; $i < scalar(@seqDomCounts); $i++) { # Go through all domain counts entries | |
| 1391 if (@seqAvgE[$i] < $bestEval) { # If the current E-value is smaller than the previous top E-value | |
| 1392 $best = $i; # Set the best value to be the current value | |
| 1393 ## Set all other best variables to those corresponding to the current value | |
| 1394 $bestCount = @seqDomCounts[$i]; | |
| 1395 $bestEval = @seqAvgE[$i]; | |
| 1396 $bestScore = @seqAvgScore[$i]; | |
| 1397 } | |
| 1398 if (@seqAvgE[$i] == $bestEval) { # If the current E-value is equal to the previous top E-value | |
| 1399 if (@seqAvgScore[$i] > $bestScore) { # If the current average score is larger than the previous top average score | |
| 1400 $best = $i; # Set the best value to be the current value | |
| 1401 ## Set all other best variables to those corresponding to the current value | |
| 1402 $bestCount = @seqDomCounts[$i]; | |
| 1403 $bestEval = @seqAvgE[$i]; | |
| 1404 $bestScore = @seqAvgScore[$i]; | |
| 1405 } | |
| 1406 if (@seqAvgScore[$i] == $bestScore) { # If the current average score is equal to the previous top average score | |
| 1407 if (@seqDomCounts[$i] > $bestCount) { # If the current number of domains is larger than the previous top number of domains | |
| 1408 $best = $i; # Set the best value to be the current value | |
| 1409 ## Set all other best variables to those corresponding to the current value | |
| 1410 $bestCount = @seqDomCounts[$i]; | |
| 1411 $bestEval = @seqAvgE[$i]; | |
| 1412 $bestScore = @seqAvgScore[$i]; | |
| 1413 } | |
| 1414 } | |
| 1415 } | |
| 1416 } | |
| 1417 } | |
| 1418 if ($priority eq "score") { # If the average score should be used to determine the most likely profile-type | |
| 1419 ## Reset variables to unrealisticly high or low values | |
| 1420 $best = 0; | |
| 1421 $bestCount = 0; | |
| 1422 $bestEval = 1000; | |
| 1423 $bestScore = -1000; | |
| 1424 | |
| 1425 for ($i = 0; $i < scalar(@seqDomCounts); $i++) { # Go through all domain counts entries | |
| 1426 if (@seqAvgScore[$i] > $bestScore) { # If the current average score is larger than the previous top average score | |
| 1427 $best = $i; # Set the best value to be the current value | |
| 1428 ## Set all other best variables to those corresponding to the current value | |
| 1429 $bestCount = @seqDomCounts[$i]; | |
| 1430 $bestEval = @seqAvgE[$i]; | |
| 1431 $bestScore = @seqAvgScore[$i]; | |
| 1432 } | |
| 1433 if (@seqAvgScore[$i] == $bestScore) { # If the current average score is equal to the previous top average score | |
| 1434 if (@seqAvgE[$i] < $bestEval) { # If the current E-value is smaller than the previous top E-value | |
| 1435 $best = $i; # Set the best value to be the current value | |
| 1436 ## Set all other best variables to those corresponding to the current value | |
| 1437 $bestCount = @seqDomCounts[$i]; | |
| 1438 $bestEval = @seqAvgE[$i]; | |
| 1439 $bestScore = @seqAvgScore[$i]; | |
| 1440 } | |
| 1441 if (@seqAvgE[$i] == $bestEval) { # If the current E-value is equal to the previous top E-value | |
| 1442 if (@seqDomCounts[$i] > $bestCount) { # If the current number of domains is larger than the previous top number of domains | |
| 1443 $best = $i; # Set the best value to be the current value | |
| 1444 ## Set all other best variables to those corresponding to the current value | |
| 1445 $bestCount = @seqDomCounts[$i]; | |
| 1446 $bestEval = @seqAvgE[$i]; | |
| 1447 $bestScore = @seqAvgScore[$i]; | |
| 1448 } | |
| 1449 } | |
| 1450 } | |
| 1451 } | |
| 1452 } | |
| 1453 | |
| 1454 if (@seqID[$best] ne "") { # If the sequence ID of the most likely profile is not empty | |
| 1455 | |
| 1456 $allanchorLens = @anchorLens[$best]; # Get the best anchor lengths | |
| 1457 @allanchorLens = split(',', $allanchorLens); # Split the anchor lens into an array | |
| 1458 | |
| 1459 $chimeric = 0; # Assume the sequence is not chimeric | |
| 1460 if ($allow_reorder == 0) { # If re-order of domain is not allowed | |
| 1461 $domain_order = @allSeqDomains[$best]; # Gather the order the domains are found in | |
| 1462 @domain_order = split(' ',$domain_order); # Split the list into an array | |
| 1463 @sorted_domain_order = sort {$a cmp $b} @domain_order; # Sort the array alphabetically | |
| 1464 for ($di = 0; $di <= scalar(@domain_order); $di++) { # Go through the sorted array | |
| 1465 if ((@domain_order[$di] ne @sorted_domain_order[$di]) || (@problemCode[$best] =~ m/C/)) { # Check if the order of the arrays differ at any poiny | |
| 1466 $chimeric = 1; # If they do differ, mark the sequence as chimeric | |
| 1467 } | |
| 1468 } | |
| 1469 } | |
| 1470 | |
| 1471 $seqDNALength = length(@seqDNA[$best]); # Get the length of the DNA sequence | |
| 1472 | |
| 1473 ## Print sequence and match data... | |
| 1474 ## Order of columns in the output file: | |
| 1475 ## ID Length Type Main/Compl Domains Avg.Eval Avg.Score Start End Start_domain End_domain Chimeric | |
| 1476 if ($out_results == 1) { | |
| 1477 print RESULTS @seqID[$best] . "\t" . $seqDNALength . "\t" . @seqITSType[$best] . "\t" . @seqCompl[$best] . "\t" . @seqDomCounts[$best] . "\t" . @seqAvgE[$best] . "\t" . @seqAvgScore[$best] . "\t" . @seqScoreSum[$best] . "\t" . @dnaStart[$best] . "\t" . @dnaEnd[$best] . "\t" . @startDomain[$best] . "\t" . @endDomain[$best] . "\t"; # Print sequence and match data to the results file | |
| 1478 if ($chimeric == 1) { # If the sequence was regarded chimeric | |
| 1479 print RESULTS "Chimeric\t"; # Add a chimeric tag to the entry | |
| 1480 } else { # If not chimeric | |
| 1481 print RESULTS "\t"; # Add an empty column | |
| 1482 } | |
| 1483 $allDomains = @allSeqDomains[$best]; # Get the domain order of the entry | |
| 1484 $allDomains =~ tr/ /,/; # Replace spaces with commas in the domain order string | |
| 1485 $allDomains = substr($allDomains,0,length($allDomains) - 1); # Remove the last character (a comma) | |
| 1486 print RESULTS $allDomains; # Write the domain order to the results file | |
| 1487 print RESULTS "\t"; # Write a tab to the results file | |
| 1488 } | |
| 1489 | |
| 1490 if ($out_pos == 1) { | |
| 1491 $out_all_pos = 1; | |
| 1492 if ($out_all_pos == 1) { # Output positions of all domains | |
| 1493 $seqPartLen = @dnaEnd[$best] - @dnaStart[$best] + 1; | |
| 1494 if ($seqPartLen < 0) { | |
| 1495 $seqPartLen = $seqPartLen * -1; | |
| 1496 } | |
| 1497 ## Print the positions of all identified domains to the position file | |
| 1498 print POS @seqID[$best] . "\t" . $seqDNALength . " bp." . "\t"; | |
| 1499 if (@problemCode[$best] !~ m/S/) { | |
| 1500 print POS "SSU: " . @ssuStart[$best] . "-" . @ssuEnd[$best] . "\t"; | |
| 1501 } else { | |
| 1502 print POS "SSU: Not found\t"; | |
| 1503 } | |
| 1504 if ((@problemCode[$best] =~ m/X/) || ((@problemCode[$best] =~ m/[15]/) && (@problemCode[$best] =~ m/S/))) { | |
| 1505 print POS "ITS1: Not found\t"; | |
| 1506 } else { | |
| 1507 if (@problemCode[$best] =~ m/O/) { | |
| 1508 print POS "ITS1: " . (@ssuEnd[$best] + 1) . "-" . (@ssuEnd[$best] + 1) . "\t"; | |
| 1509 } else { | |
| 1510 print POS "ITS1: " . @its1Start[$best] . "-" . @its1End[$best] . "\t"; | |
| 1511 } | |
| 1512 } | |
| 1513 if (@problemCode[$best] !~ m/[125OP]/) { | |
| 1514 print POS "5.8S: " . @midStart[$best] . "-" . @midEnd[$best] . "\t"; | |
| 1515 } else { | |
| 1516 if (@problemCode[$best] =~ m/5/) { | |
| 1517 print POS "5.8S: Not found\t"; | |
| 1518 } else { | |
| 1519 if (@problemCode[$best] =~ m/1/) { | |
| 1520 print POS "5.8S: No start\t"; | |
| 1521 } else { | |
| 1522 if (@problemCode[$best] =~ m/O/) { | |
| 1523 print POS "5.8S: Overlap SSU\t"; | |
| 1524 } else { | |
| 1525 if (@problemCode[$best] =~ m/P/) { | |
| 1526 print POS "5.8S: Overlap LSU\t"; | |
| 1527 } else { | |
| 1528 print POS "5.8S: No end\t"; | |
| 1529 } | |
| 1530 } | |
| 1531 } | |
| 1532 } | |
| 1533 } | |
| 1534 if ((@problemCode[$best] =~ m/Y/) ||((@problemCode[$best] =~ m/[25]/) && (@problemCode[$best] =~ m/L/))) { | |
| 1535 print POS "ITS2: Not found\t"; | |
| 1536 } else { | |
| 1537 if (@problemCode[$best] =~ m/O/) { | |
| 1538 print POS "ITS2: " . (@lsuStart[$best] - 1) . "-" . (@lsuStart[$best] - 1) . "\t"; | |
| 1539 } else { | |
| 1540 print POS "ITS2: " . @its2Start[$best] . "-" . @its2End[$best] . "\t"; | |
| 1541 } | |
| 1542 } | |
| 1543 if (@problemCode[$best] !~ m/L/) { | |
| 1544 print POS "LSU: " . @lsuStart[$best] . "-" . @lsuEnd[$best] . "\t"; | |
| 1545 } else { | |
| 1546 print POS "LSU: Not found" . "\t"; | |
| 1547 } | |
| 1548 | |
| 1549 if (@problemCode[$best] =~ m/5/) { | |
| 1550 print POS "Broken or partial sequence, no 5.8S! "; | |
| 1551 } else { | |
| 1552 if (@problemCode[$best] =~ m/[12]/) { | |
| 1553 print POS "Broken or partial sequence, only partial 5.8S! "; | |
| 1554 } | |
| 1555 } | |
| 1556 if (@problemCode[$best] =~ m/B/) { | |
| 1557 print POS "ITS region too long! "; | |
| 1558 } | |
| 1559 if (@problemCode[$best] =~ m/O/) { | |
| 1560 print POS "5.8S seem to overlap with SSU! "; | |
| 1561 } | |
| 1562 if (@problemCode[$best] =~ m/P/) { | |
| 1563 print POS "5.8S seem to overlap with LSU! "; | |
| 1564 } | |
| 1565 if (@problemCode[$best] =~ m/C/) { | |
| 1566 print POS "Chimeric! "; | |
| 1567 } | |
| 1568 print POS "\n"; | |
| 1569 | |
| 1570 } else { # Output only ITS positions | |
| 1571 $seqPartLen = @dnaEnd[$best] - @dnaStart[$best] + 1; | |
| 1572 print POS @seqID[$best] . "\t" . $seqDNALength . " bp." . "\t" . "ITS1: " . @its1Start[$best] . "-" . @its1End[$best] . "\t" . "ITS2: " . @its2Start[$best] . "-" . @its2End[$best] . "\n"; # Print the positions of the ITS sequences to the position file | |
| 1573 } | |
| 1574 } | |
| 1575 | |
| 1576 if (@problem[$best] ne "") { | |
| 1577 $foundProblem++; | |
| 1578 print PROBLEM @seqID[$best] . "\t" . @problem[$best] . "\n"; | |
| 1579 } | |
| 1580 | |
| 1581 ## Set extended type string (the string going into the definition line of the FASTA file) | |
| 1582 $extendedType = $profileIndex{@seqITSType[$best]} . " ITS sequence"; | |
| 1583 @itsCounts[ord(@seqITSType[$best])]++; # Add one ITS to the appropriate counter | |
| 1584 | |
| 1585 if (@seqCompl[$best] == 1) { # If domains were found on complementary strand | |
| 1586 $extendedStrand = "complementary strand"; # Set the strand string to complementary | |
| 1587 $itsCompl++; # Add one to the complementary strand counter | |
| 1588 } else { # If domains were found on the main strand | |
| 1589 $extendedStrand = "main strand"; # Set the strand string to main | |
| 1590 $itsMain++; # Add one to the main strand counter | |
| 1591 } | |
| 1592 | |
| 1593 ## Print (extracted) ITS sequence... | |
| 1594 if ($out_fasta == 1) { # If FASTA-output is on | |
| 1595 if ($truncate == 0) { # If the whole sequence should be kept in output file | |
| 1596 if ($chimeric == 0) { # If the sequence is not chimeric | |
| 1597 if (@problemCode[$best] !~ m/[SL]/) { | |
| 1598 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1599 print FASTA $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1600 } else { | |
| 1601 print FASTA ">" . @seqID[$best] . "|" . @seqITSType[$best] ." " . $extendedType . " (" . $seqDNALength . " bp) on " . $extendedStrand . "\n"; # Write FASTA definition line | |
| 1602 } | |
| 1603 print FASTA @seqDNA[$best] . "\n"; # Write DNA sequence | |
| 1604 } | |
| 1605 if ($out_partial > 0) { | |
| 1606 if (@problemCode[$best] !~ m/[SL125]/) { | |
| 1607 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1608 print FULLPARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1609 } else { | |
| 1610 print FULLPARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] ." " . $extendedType . " (" . $seqDNALength . " bp) Full ITS region on " . $extendedStrand . "\n"; # Write FASTA defline | |
| 1611 } | |
| 1612 print FULLPARTIAL @seqDNA[$best] . "\n"; # Write DNA sequence | |
| 1613 } else { | |
| 1614 $its1PartLen = @its1End[$best] - @its1Start[$best] + 1; | |
| 1615 $its2PartLen = @its2End[$best] - @its2Start[$best] + 1; | |
| 1616 if (($out_partial < $its1PartLen) && ($out_partial < $its2PartLen)) { | |
| 1617 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1618 print FULLPARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1619 } else { | |
| 1620 print FULLPARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] ." " . $extendedType . " (" . $seqDNALength . " bp) Partial ITS region on " . $extendedStrand . "\n"; # Write FASTA defline | |
| 1621 } | |
| 1622 print FULLPARTIAL @seqDNA[$best] . "\n"; # Write DNA sequence | |
| 1623 } | |
| 1624 } | |
| 1625 } | |
| 1626 } else { # If sequence is regarded chimeric | |
| 1627 $itsChimeric++; # Add one to the chimeric counter | |
| 1628 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1629 print CHIMERA $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1630 } else { | |
| 1631 print CHIMERA ">" . @seqID[$best] . "|" . @seqITSType[$best] ." Chimeric " . $extendedType . " (" . length(@seqDNA[$best]) . " bp) on " . $extendedStrand . "\n"; # Write FASTA definition line | |
| 1632 } | |
| 1633 print CHIMERA @seqDNA[$best] . "\n"; # Write DNA sequence | |
| 1634 $foundChimera++; | |
| 1635 } | |
| 1636 } else { # If only the ITS part of the sequence should be saved to output file | |
| 1637 $fastaStartPoint = @dnaStart[$best] - 1; # Start extraction at the start of the first domain | |
| 1638 $fastaEndPoint = @dnaEnd[$best]+10; # End extraction 10 bp after the last domain | |
| 1639 | |
| 1640 if (@seqDomCounts[$best] > 1) { # If more than one domain was found | |
| 1641 if (substr(@startDomain[$best],0,5) eq "1_SSU") { # If the first domain was SSU | |
| 1642 $fastaStartPoint = @ssuEnd[$best]; # Set the start point of the extraction to the end of the SSU domain | |
| 1643 } | |
| 1644 if (substr(@startDomain[$best],0,5) eq "4_LSU") { # If the first domain was LSU | |
| 1645 $fastaStartPoint = @lsuEnd[$best]; # Set the start point of the extraction to the end of the LSU domain | |
| 1646 } | |
| 1647 if (substr(@endDomain[$best],0,5) eq "1_SSU") { # If the last domain was SSU | |
| 1648 $fastaEndPoint = @ssuStart[$best] - 1; # Set the end point of the extraction to the start of the SSU domain | |
| 1649 } | |
| 1650 if (substr(@endDomain[$best],0,5) eq "4_LSU") { # If the last domain was LSU | |
| 1651 $fastaEndPoint = @lsuStart[$best] - 1; # Set the end point of the extraction to the start of the LSU domain | |
| 1652 } | |
| 1653 } | |
| 1654 | |
| 1655 if ($fastaStartPoint < 0) { # If the start point is smaller than zero, set the start point to zero | |
| 1656 $fastaStartPoint = 0; | |
| 1657 } | |
| 1658 if ($fastaEndPoint > length(@seqDNA[$best])) { # If the end point is larger than the length of the sequence, set the end point to the sequence end | |
| 1659 $fastaEndPoint = length(@seqDNA[$best]); | |
| 1660 } | |
| 1661 | |
| 1662 $fastaLength = $fastaEndPoint - $fastaStartPoint + 1; | |
| 1663 | |
| 1664 if ($chimeric == 0) { # If the sequence is not chimeric | |
| 1665 if (@problemCode[$best] !~ m/[SL]/) { | |
| 1666 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1667 print FASTA $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1668 } else { | |
| 1669 print FASTA ">" . @seqID[$best] . "|" . @seqITSType[$best] . " " . $extendedType . " (" . $fastaLength . " bp) on " . $extendedStrand . "\n"; # Write FASTA definition line | |
| 1670 } | |
| 1671 if ($anchorLen > 0) { | |
| 1672 if ($fastaStartPoint - @allanchorLens[0] > 0) { | |
| 1673 print FASTA substr(@seqDNA[$best],$fastaStartPoint - @allanchorLens[0],$fastaLength + @allanchorLens[0] + @allanchorLens[3]) . "\n"; # Write DNA sequence | |
| 1674 } else { | |
| 1675 print FASTA substr(@seqDNA[$best],0,$fastaLength + $fastaStartPoint + @allanchorLens[3]) . "\n"; # Write DNA sequence | |
| 1676 } | |
| 1677 } else { | |
| 1678 print FASTA substr(@seqDNA[$best],$fastaStartPoint,$fastaLength) . "\n"; # Write DNA sequence | |
| 1679 } | |
| 1680 } | |
| 1681 if ($out_partial > 0) { | |
| 1682 if (@problemCode[$best] !~ m/[SL125]/) { | |
| 1683 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1684 print FULLPARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1685 } else { | |
| 1686 print FULLPARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] ." " . $extendedType . " (" . $seqDNALength . " bp) Full ITS region on " . $extendedStrand . "\n"; # Write FASTA defline | |
| 1687 } | |
| 1688 print FULLPARTIAL substr(@seqDNA[$best],$fastaStartPoint,$fastaLength) . "\n"; # Write DNA sequence | |
| 1689 } else { | |
| 1690 $its1PartLen = @its1End[$best] - @its1Start[$best] + 1; | |
| 1691 $its2PartLen = @its2End[$best] - @its2Start[$best] + 1; | |
| 1692 if (($out_partial < $its1PartLen) && ($out_partial < $its2PartLen)) { | |
| 1693 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1694 print FULLPARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1695 } else { | |
| 1696 print FULLPARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] ." " . $extendedType . " (" . $seqDNALength . " bp) Partial ITS region on " . $extendedStrand . "\n"; # Write FASTA defline | |
| 1697 } | |
| 1698 print FULLPARTIAL substr(@seqDNA[$best],$fastaStartPoint,$fastaLength) . "\n"; # Write DNA sequence | |
| 1699 } | |
| 1700 } | |
| 1701 } | |
| 1702 } else { # If sequence is regarded chimeric | |
| 1703 $itsChimeric++; # Add one to the chimeric counter | |
| 1704 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1705 print CHIMERA $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1706 } else { | |
| 1707 print CHIMERA ">" . @seqID[$best] . "|" . @seqITSType[$best] . " Chimeric " . $extendedType . " (" . $fastaLength . " bp) on " . $extendedStrand . "\n"; # Write FASTA definition line | |
| 1708 } | |
| 1709 print CHIMERA @seqDNA[$best] . "\n"; # Write DNA sequence | |
| 1710 $foundChimera++; | |
| 1711 } | |
| 1712 } | |
| 1713 } | |
| 1714 | |
| 1715 if ($out_joined == 1) { | |
| 1716 if ($chimeric == 0) { # If the sequence is not chimeric | |
| 1717 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1718 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1719 } else { | |
| 1720 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . " " . $extendedType . " (" . $fastaLength . " bp) From domain " . @startDomain[$best] . " to " . @endDomain[$best] . " on " . $extendedStrand . " Found domains: "; # Write FASTA definition line | |
| 1721 print JOINED substr(@allSeqDomains[$best],0,length(@allSeqDomains[$best]) - 1) . "\n"; # Write domain order | |
| 1722 } | |
| 1723 print JOINED substr(@seqDNA[$best],$fastaStartPoint,$fastaLength) . "\n"; # Write DNA sequence | |
| 1724 } else { # If sequence is regarded chimeric | |
| 1725 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1726 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1727 } else { | |
| 1728 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . " Chimeric " . $extendedType . " (" . $fastaLength . " bp) From domain " . @startDomain[$best] . " to " . @endDomain[$best] . " on " . $extendedStrand . " Found domains: "; # Write FASTA definition line | |
| 1729 print JOINED substr(@allSeqDomains[$best],0,length(@allSeqDomains[$best]) - 1) . "\n"; # Write domain order | |
| 1730 } | |
| 1731 print JOINED substr(@seqDNA[$best],$fastaStartPoint,$fastaLength) . "\n"; # Write DNA sequence | |
| 1732 } | |
| 1733 } | |
| 1734 | |
| 1735 ## Write SSU sequence to file | |
| 1736 if ($out_ssu == 1) { # If SSU output is on | |
| 1737 if (@problemCode[$best] !~ m/S/) { | |
| 1738 if ($only_full == 0) { | |
| 1739 $seqPartLen = @ssuEnd[$best] - @ssuStart[$best] + 1; | |
| 1740 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1741 print SSU $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1742 } else { | |
| 1743 print SSU ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|SSU " . "Extracted SSU sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1744 } | |
| 1745 if (uc($anchor) ne "HMM") { | |
| 1746 if (@ssuStart[$best] - 1 - $anchorLen > 0) { | |
| 1747 print SSU substr(@seqDNA[$best], @ssuStart[$best] - 1 - $anchorLen, $anchorLen); | |
| 1748 } else { | |
| 1749 print SSU substr(@seqDNA[$best], 0, @ssuStart[$best] - 1); | |
| 1750 } | |
| 1751 print SSU substr(@seqDNA[$best], @ssuStart[$best] - 1, @ssuEnd[$best] - @ssuStart[$best] + 1); | |
| 1752 if (@ssuEnd[$best] - @ssuStart[$best] + 1 - $anchorLen > 0) { | |
| 1753 print SSU substr(@seqDNA[$best], @ssuEnd[$best], $anchorLen); | |
| 1754 } else { | |
| 1755 print SSU substr(@seqDNA[$best], @ssuEnd[$best]); | |
| 1756 } | |
| 1757 } else { | |
| 1758 print SSU substr(@seqDNA[$best], @ssuStart[$best] - 1, @ssuEnd[$best] - @ssuStart[$best] + 1); | |
| 1759 } | |
| 1760 print SSU "\n"; # Write DNA sequence | |
| 1761 } | |
| 1762 } | |
| 1763 } | |
| 1764 | |
| 1765 ## Write LSU sequence to file | |
| 1766 if ($out_lsu == 1) { # If LSU output is on | |
| 1767 if ($only_full == 0) { | |
| 1768 if (@problemCode[$best] !~ m/L/) { | |
| 1769 $seqPartLen = @lsuEnd[$best] - @lsuStart[$best] + 1; | |
| 1770 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1771 print LSU $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1772 } else { | |
| 1773 print LSU ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|LSU " . "Extracted LSU sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1774 } | |
| 1775 if (uc($anchor) ne "HMM") { | |
| 1776 if (@lsuStart[$best] - 1 - $anchorLen > 0) { | |
| 1777 print LSU substr(@seqDNA[$best], @lsuStart[$best] - 1 - $anchorLen, $anchorLen); | |
| 1778 } else { | |
| 1779 print LSU substr(@seqDNA[$best], 0, @lsuStart[$best] - 1); | |
| 1780 } | |
| 1781 print LSU substr(@seqDNA[$best], @lsuStart[$best] - 1, @lsuEnd[$best] - @lsuStart[$best] + 1); | |
| 1782 #if (@lsuEnd[$best] - @lsuStart[$best] + 1 - $anchorLen > 0) { | |
| 1783 # print LSU substr(@seqDNA[$best], @lsuEnd[$best] - @lsuStart[$best] + 1, $anchorLen); | |
| 1784 #} else { | |
| 1785 # print LSU substr(@seqDNA[$best], @lsuEnd[$best] - @lsuStart[$best] + 1); | |
| 1786 #} | |
| 1787 } else { | |
| 1788 print LSU substr(@seqDNA[$best], @lsuStart[$best] - 1, @lsuEnd[$best] - @lsuStart[$best] + 1); | |
| 1789 } | |
| 1790 print LSU "\n"; # Write DNA sequence | |
| 1791 } | |
| 1792 } | |
| 1793 } | |
| 1794 | |
| 1795 ## Write 5.8S sequence to file | |
| 1796 if ($out_58S == 1) { # If 5.8S output is on | |
| 1797 if (@problemCode[$best] !~ m/[125]/) { | |
| 1798 $seqPartLen = @midEnd[$best] - @midStart[$best] + 1; | |
| 1799 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1800 print MID $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1801 } else { | |
| 1802 print MID ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|5.8S " . "Extracted 5.8S sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1803 } | |
| 1804 if (uc($anchor) ne "HMM") { | |
| 1805 if (@midStart[$best] - 1 - $anchorLen > 0) { | |
| 1806 print MID substr(@seqDNA[$best], @midStart[$best] - 1 - $anchorLen, $anchorLen); | |
| 1807 } else { | |
| 1808 print MID substr(@seqDNA[$best], 0, @midStart[$best] - 1); | |
| 1809 } | |
| 1810 print MID substr(@seqDNA[$best], @midStart[$best] - 1, @midEnd[$best] - @midStart[$best] + 1); | |
| 1811 if (@midEnd[$best] - @midStart[$best] + 1 - $anchorLen > 0) { | |
| 1812 print MID substr(@seqDNA[$best], @midEnd[$best], $anchorLen); | |
| 1813 } else { | |
| 1814 print MID substr(@seqDNA[$best], @midEnd[$best]); | |
| 1815 } | |
| 1816 } else { | |
| 1817 print MID substr(@seqDNA[$best], @midStart[$best] - 1, @midEnd[$best] - @midStart[$best] + 1); | |
| 1818 } | |
| 1819 print MID "\n"; # Write DNA sequence | |
| 1820 } | |
| 1821 } | |
| 1822 | |
| 1823 ## Write ITS1 sequence to file | |
| 1824 if ($out_its1 == 1) { # If ITS1 output is on | |
| 1825 $seqPartLen = @its1End[$best] - @its1Start[$best] + 1; | |
| 1826 if ($seqPartLen > 1) { | |
| 1827 if ( (($only_full == 0) && (@problemCode[$best] !~ m/[15]/)) || (($only_full == 1) && (@problemCode[$best] !~ m/[S15]/))) { | |
| 1828 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1829 print ITS1 $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1830 } else { | |
| 1831 print ITS1 ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS1 " . "Extracted ITS1 sequence " . @its1Start[$best] . "-" . @its1End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1832 } | |
| 1833 if (uc($anchor) ne "HMM") { | |
| 1834 if (@its1Start[$best] - 1 - $anchorLen > 0) { | |
| 1835 print ITS1 substr(@seqDNA[$best], @its1Start[$best] - 1 - $anchorLen, $anchorLen); | |
| 1836 } else { | |
| 1837 print ITS1 substr(@seqDNA[$best], 0, @its1Start[$best] - 1); | |
| 1838 } | |
| 1839 print ITS1 substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); | |
| 1840 if (length(@seqDNA[$best]) - @its1End[$best] - $anchorLen > 0) { | |
| 1841 print ITS1 substr(@seqDNA[$best], @its1End[$best], $anchorLen); | |
| 1842 } else { | |
| 1843 print ITS1 substr(@seqDNA[$best], @its1End[$best]); | |
| 1844 } | |
| 1845 } else { | |
| 1846 if (@its1Start[$best] - 1 - @allanchorLens[0] > 0) { | |
| 1847 print ITS1 substr(@seqDNA[$best], @its1Start[$best] - 1 - @allanchorLens[0], @allanchorLens[0]); | |
| 1848 } else { | |
| 1849 print ITS1 substr(@seqDNA[$best], 0, @its1Start[$best] - 1); | |
| 1850 } | |
| 1851 print ITS1 substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); | |
| 1852 if (length(@seqDNA[$best]) - @its1End[$best] - @allanchorLens[1] > 0) { | |
| 1853 print ITS1 substr(@seqDNA[$best], @its1End[$best], @allanchorLens[1]); | |
| 1854 } else { | |
| 1855 print ITS1 substr(@seqDNA[$best], @its1End[$best]); | |
| 1856 } | |
| 1857 | |
| 1858 } | |
| 1859 print ITS1 "\n"; # Write DNA sequence | |
| 1860 } | |
| 1861 } | |
| 1862 if ($out_partial > 0) { | |
| 1863 if (@problemCode[$best] !~ m/[S15]/) { | |
| 1864 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1865 print ITS1PARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1866 } else { | |
| 1867 print ITS1PARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS1 " . "Extracted Full ITS1 sequence " . @its1Start[$best] . "-" . @its1End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1868 } | |
| 1869 if (uc($anchor) ne "HMM") { | |
| 1870 if (@its1Start[$best] - 1 - $anchorLen > 0) { | |
| 1871 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1 - $anchorLen, $anchorLen); | |
| 1872 } else { | |
| 1873 print ITS1PARTIAL substr(@seqDNA[$best], 0, @its1Start[$best] - 1); | |
| 1874 } | |
| 1875 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); | |
| 1876 if (length(@seqDNA[$best]) - @its1End[$best] - $anchorLen > 0) { | |
| 1877 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best], $anchorLen); | |
| 1878 } else { | |
| 1879 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best]); | |
| 1880 } | |
| 1881 } else { | |
| 1882 if (@its1Start[$best] - 1 - @allanchorLens[0] > 0) { | |
| 1883 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1 - @allanchorLens[0], @allanchorLens[0]); | |
| 1884 } else { | |
| 1885 print ITS1PARTIAL substr(@seqDNA[$best], 0, @its1Start[$best] - 1); | |
| 1886 } | |
| 1887 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); | |
| 1888 if (length(@seqDNA[$best]) - @its1End[$best] - @allanchorLens[1] > 0) { | |
| 1889 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best], @allanchorLens[1]); | |
| 1890 } else { | |
| 1891 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best]); | |
| 1892 } | |
| 1893 | |
| 1894 } | |
| 1895 print ITS1PARTIAL "\n"; # Write DNA sequence | |
| 1896 } else { | |
| 1897 if ($out_partial < $seqPartLen) { | |
| 1898 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1899 print ITS1PARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1900 } else { | |
| 1901 print ITS1PARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS1 " . "Extracted Partial ITS1 sequence " . @its1Start[$best] . "-" . @its1End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1902 } | |
| 1903 if (uc($anchor) ne "HMM") { | |
| 1904 if (@its1Start[$best] - 1 - $anchorLen > 0) { | |
| 1905 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1 - $anchorLen, $anchorLen); | |
| 1906 } else { | |
| 1907 print ITS1PARTIAL substr(@seqDNA[$best], 0, @its1Start[$best] - 1); | |
| 1908 } | |
| 1909 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); | |
| 1910 if (length(@seqDNA[$best]) - @its1End[$best] - $anchorLen > 0) { | |
| 1911 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best], $anchorLen); | |
| 1912 } else { | |
| 1913 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best]); | |
| 1914 } | |
| 1915 } else { | |
| 1916 if (@its1Start[$best] - 1 - @allanchorLens[0] > 0) { | |
| 1917 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1 - @allanchorLens[0], @allanchorLens[0]); | |
| 1918 } else { | |
| 1919 print ITS1PARTIAL substr(@seqDNA[$best], 0, @its1Start[$best] - 1); | |
| 1920 } | |
| 1921 print ITS1PARTIAL substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); | |
| 1922 if (length(@seqDNA[$best]) - @its1End[$best] - @allanchorLens[1] > 0) { | |
| 1923 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best], @allanchorLens[1]); | |
| 1924 } else { | |
| 1925 print ITS1PARTIAL substr(@seqDNA[$best], @its1End[$best]); | |
| 1926 } | |
| 1927 } | |
| 1928 print ITS1PARTIAL "\n"; # Write DNA sequence | |
| 1929 } | |
| 1930 } | |
| 1931 } | |
| 1932 } | |
| 1933 | |
| 1934 ## Write ITS2 sequence to file | |
| 1935 if ($out_its2 == 1) { # If ITS2 output is on | |
| 1936 $seqPartLen = @its2End[$best] - @its2Start[$best] + 1; | |
| 1937 if ($seqPartLen > 1) { | |
| 1938 if ( (($only_full == 0) && (@problemCode[$best] !~ m/[25]/)) || (($only_full == 1) && (@problemCode[$best] !~ m/[L25]/))) { | |
| 1939 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1940 print ITS2 $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1941 } else { | |
| 1942 print ITS2 ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS2 " . "Extracted ITS2 sequence " . @its2Start[$best] . "-" . @its2End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1943 } | |
| 1944 if (uc($anchor) ne "HMM") { | |
| 1945 if (@its2Start[$best] - 1 - $anchorLen > 0) { | |
| 1946 print ITS2 substr(@seqDNA[$best], @its2Start[$best] - 1 - $anchorLen, $anchorLen); | |
| 1947 } else { | |
| 1948 print ITS2 substr(@seqDNA[$best], 0, @its2Start[$best] - 1); | |
| 1949 } | |
| 1950 print ITS2 substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); | |
| 1951 if (length(@seqDNA[$best]) - @its2End[$best] - $anchorLen > 0) { | |
| 1952 print ITS2 substr(@seqDNA[$best], @its2End[$best], $anchorLen); | |
| 1953 } else { | |
| 1954 print ITS2 substr(@seqDNA[$best], @its2End[$best]); | |
| 1955 } | |
| 1956 } else { | |
| 1957 if (@its2Start[$best] - 1 - @allanchorLens[2] > 0) { | |
| 1958 print ITS2 substr(@seqDNA[$best], @its2Start[$best] - 1 - @allanchorLens[2], @allanchorLens[2]); | |
| 1959 } else { | |
| 1960 print ITS2 substr(@seqDNA[$best], 0, @its2Start[$best] - 1); | |
| 1961 } | |
| 1962 print ITS2 substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); | |
| 1963 if (length(@seqDNA[$best]) - @its2End[$best] - @allanchorLens[3] > 0) { | |
| 1964 print ITS2 substr(@seqDNA[$best], @its2End[$best], @allanchorLens[3]); | |
| 1965 } else { | |
| 1966 print ITS2 substr(@seqDNA[$best], @its2End[$best]); | |
| 1967 } | |
| 1968 } | |
| 1969 print ITS2 "\n"; # Write DNA sequence | |
| 1970 } | |
| 1971 if ($out_partial > 0) { | |
| 1972 if (@problemCode[$best] !~ m/[L25]/) { | |
| 1973 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 1974 print ITS2PARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 1975 } else { | |
| 1976 print ITS2PARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS2 " . "Extracted Full ITS2 sequence " . @its2Start[$best] . "-" . @its2End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 1977 } | |
| 1978 if (uc($anchor) ne "HMM") { | |
| 1979 if (@its2Start[$best] - 1 - $anchorLen > 0) { | |
| 1980 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1 - $anchorLen, $anchorLen); | |
| 1981 } else { | |
| 1982 print ITS2PARTIAL substr(@seqDNA[$best], 0, @its2Start[$best] - 1); | |
| 1983 } | |
| 1984 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); | |
| 1985 if (length(@seqDNA[$best]) - @its2End[$best] - $anchorLen > 0) { | |
| 1986 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best], $anchorLen); | |
| 1987 } else { | |
| 1988 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best]); | |
| 1989 } | |
| 1990 } else { | |
| 1991 if (@its2Start[$best] - 1 - @allanchorLens[2] > 0) { | |
| 1992 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1 - @allanchorLens[2], @allanchorLens[2]); | |
| 1993 } else { | |
| 1994 print ITS2PARTIAL substr(@seqDNA[$best], 0, @its2Start[$best] - 1); | |
| 1995 } | |
| 1996 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); | |
| 1997 if (length(@seqDNA[$best]) - @its2End[$best] - @allanchorLens[3] > 0) { | |
| 1998 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best], @allanchorLens[3]); | |
| 1999 } else { | |
| 2000 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best]); | |
| 2001 } | |
| 2002 } | |
| 2003 print ITS2PARTIAL "\n"; # Write DNA sequence | |
| 2004 } else { | |
| 2005 if ($out_partial < $seqPartLen) { | |
| 2006 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2007 print ITS2PARTIAL $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2008 } else { | |
| 2009 print ITS2PARTIAL ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS2 " . "Extracted Partial ITS2 sequence " . @its2Start[$best] . "-" . @its2End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 2010 } | |
| 2011 if (uc($anchor) ne "HMM") { | |
| 2012 if (@its2Start[$best] - 1 - $anchorLen > 0) { | |
| 2013 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1 - $anchorLen, $anchorLen); | |
| 2014 } else { | |
| 2015 print ITS2PARTIAL substr(@seqDNA[$best], 0, @its2Start[$best] - 1); | |
| 2016 } | |
| 2017 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); | |
| 2018 if (length(@seqDNA[$best]) - @its2End[$best] - $anchorLen > 0) { | |
| 2019 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best], $anchorLen); | |
| 2020 } else { | |
| 2021 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best]); | |
| 2022 } | |
| 2023 } else { | |
| 2024 if (@its2Start[$best] - 1 - @allanchorLens[2] > 0) { | |
| 2025 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1 - @allanchorLens[2], @allanchorLens[2]); | |
| 2026 } else { | |
| 2027 print ITS2PARTIAL substr(@seqDNA[$best], 0, @its2Start[$best] - 1); | |
| 2028 } | |
| 2029 print ITS2PARTIAL substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); | |
| 2030 if (length(@seqDNA[$best]) - @its2End[$best] - @allanchorLens[3] > 0) { | |
| 2031 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best], @allanchorLens[3]); | |
| 2032 } else { | |
| 2033 print ITS2PARTIAL substr(@seqDNA[$best], @its2End[$best]); | |
| 2034 } | |
| 2035 } | |
| 2036 print ITS2PARTIAL "\n"; # Write DNA sequence | |
| 2037 } | |
| 2038 } | |
| 2039 } | |
| 2040 } | |
| 2041 } | |
| 2042 | |
| 2043 ## Output concatenated ITS1 + ITS2 sequences | |
| 2044 if ($out_concat == 1) { | |
| 2045 $seqPartLen1 = @its1End[$best] - @its1Start[$best] + 1; | |
| 2046 $seqPartLen2 = @its2End[$best] - @its2Start[$best] + 1; | |
| 2047 if (($seqPartLen1 >= $concat_minlen) && ($seqPartLen2 >= $concat_minlen)) { | |
| 2048 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2049 print CONCAT $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2050 } else { | |
| 2051 print CONCAT ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS1+2 " . "Concatenated ITS1 and ITS2 sequences (" . ($seqPartLen1 + $seqPartLen2) . " bp)\n"; # Write FASTA definition line | |
| 2052 } | |
| 2053 print CONCAT substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); # Write ITS1 DNA sequence | |
| 2054 print CONCAT "-----"; # Write spacer | |
| 2055 print CONCAT substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); # Write ITS2 DNA sequence | |
| 2056 print CONCAT "\n"; # Write newline | |
| 2057 } else { | |
| 2058 if ($seqPartLen1 >= $concat_minlen) { | |
| 2059 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2060 print CONCAT $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2061 } else { | |
| 2062 print CONCAT ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS1 " . "ITS1 sequence (ITS2 too short) (" . $seqPartLen1 . " bp)\n"; # Write FASTA definition line | |
| 2063 } | |
| 2064 print CONCAT substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1); # Write ITS1 DNA sequence | |
| 2065 print CONCAT "-----"; # Write spacer | |
| 2066 print CONCAT "\n"; # Write newline | |
| 2067 } | |
| 2068 if ($seqPartLen2 >= $concat_minlen) { | |
| 2069 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2070 print CONCAT $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2071 } else { | |
| 2072 print CONCAT ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS2 " . "ITS2 sequence (ITS1 too short) (" . $seqPartLen2 . " bp)\n"; # Write FASTA definition line | |
| 2073 } | |
| 2074 print CONCAT "-----"; # Write spacer | |
| 2075 print CONCAT substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1); # Write ITS2 DNA sequence | |
| 2076 print CONCAT "\n"; # Write newline | |
| 2077 } | |
| 2078 } | |
| 2079 } | |
| 2080 | |
| 2081 ## Print all sequences to the joined file for debugging | |
| 2082 if ($out_joined == 1) { | |
| 2083 $seqPartLen = @ssuEnd[$best] - @ssuStart[$best] + 1; | |
| 2084 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2085 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2086 } else { | |
| 2087 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|SSU " . "Extracted SSU sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 2088 } | |
| 2089 print JOINED substr(@seqDNA[$best], @ssuStart[$best] - 1, @ssuEnd[$best] - @ssuStart[$best] + 1) . "\n"; # Write DNA sequence | |
| 2090 | |
| 2091 $seqPartLen = @its1End[$best] - @its1Start[$best] + 1; | |
| 2092 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2093 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2094 } else { | |
| 2095 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS1 " . "Extracted ITS1 sequence " . @its1Start[$best] . "-" . @its1End[$best] . " (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 2096 } | |
| 2097 print JOINED substr(@seqDNA[$best], @its1Start[$best] - 1, @its1End[$best] - @its1Start[$best] + 1) . "\n"; # Write DNA sequence | |
| 2098 | |
| 2099 $seqPartLen = @midEnd[$best] - @midStart[$best] + 1; | |
| 2100 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2101 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2102 } else { | |
| 2103 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|5.8S " . "Extracted 5.8S sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 2104 } | |
| 2105 print JOINED substr(@seqDNA[$best], @midStart[$best] - 1, @midEnd[$best] - @midStart[$best] + 1) . "\n"; # Write DNA sequence | |
| 2106 | |
| 2107 $seqPartLen = @its2End[$best] - @its2Start[$best] + 1; | |
| 2108 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2109 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2110 } else { | |
| 2111 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|ITS2 " . "Extracted ITS2 sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 2112 } | |
| 2113 print JOINED substr(@seqDNA[$best], @its2Start[$best] - 1, @its2End[$best] - @its2Start[$best] + 1) . "\n"; # Write DNA sequence | |
| 2114 | |
| 2115 $seqPartLen = @lsuEnd[$best] - @lsuStart[$best] + 1; | |
| 2116 if ($out_preserve == 1) { # If sequence headers should be preserved | |
| 2117 print JOINED $headers{@seqID[$best]} . "\n"; # Write FASTA definition line | |
| 2118 } else { | |
| 2119 print JOINED ">" . @seqID[$best] . "|" . @seqITSType[$best] . "|LSU " . "Extracted LSU sequence (" . $seqPartLen . " bp)\n"; # Write FASTA definition line | |
| 2120 } | |
| 2121 print JOINED substr(@seqDNA[$best], @lsuStart[$best] - 1, @lsuEnd[$best] - @lsuStart[$best] + 1) . "\n"; # Write DNA sequence | |
| 2122 } | |
| 2123 | |
| 2124 | |
| 2125 if ($out_results == 1) { | |
| 2126 ## Print info on all matches, also not top ones to the results file... | |
| 2127 for ($i = 0; $i < scalar(@seqITSType); $i++) { # Go through all the possible ITS types... | |
| 2128 print RESULTS @seqITSType[$i] . ": " . @seqDomCounts[$i] . " " . @seqAvgE[$i] . " " . @seqAvgScore[$i]; # Write some info on this type (Type, Domain count, Average E-value, Average score) | |
| 2129 if ($i < scalar(@seqITSType) - 1) { # If this is no the last domain type | |
| 2130 print RESULTS ", "; # Write a comma | |
| 2131 } | |
| 2132 } | |
| 2133 print RESULTS "\n"; # Write end of line | |
| 2134 } | |
| 2135 } | |
| 2136 | |
| 2137 ## Undefine all used arrays for the next round... | |
| 2138 undef @seqID; | |
| 2139 undef @seqITSType; | |
| 2140 undef @seqCompl; | |
| 2141 undef @seqDomCounts; | |
| 2142 undef @seqAvgE; | |
| 2143 undef @seqAvgScore; | |
| 2144 undef @seqScoreSum; | |
| 2145 undef @seqDNA; | |
| 2146 undef @allSeqDomains; | |
| 2147 undef @dnaStart; | |
| 2148 undef @dnaEnd; | |
| 2149 undef @ssuStart; | |
| 2150 undef @ssuEnd; | |
| 2151 undef @lsuStart; | |
| 2152 undef @lsuEnd; | |
| 2153 undef @midStart; | |
| 2154 undef @midEnd; | |
| 2155 undef @its1Start; | |
| 2156 undef @its1End; | |
| 2157 undef @its2Start; | |
| 2158 undef @its2End; | |
| 2159 undef @startDomain; | |
| 2160 undef @endDomain; | |
| 2161 undef @domain_order; | |
| 2162 undef @sorted_domain_order; | |
| 2163 undef @problem; | |
| 2164 undef @problemCode; | |
| 2165 undef @anchorLens; | |
| 2166 } | |
| 2167 if (@item[0] ne "") { # Add this entry to the set (regardless if the entry has the same ID as entries already in the set), as long as it is non-empty | |
| 2168 push(@seqID, @item[0]); # Add sequence ID | |
| 2169 push(@seqITSType, @item[1]); # Add ITS type | |
| 2170 push(@seqCompl, @item[2]); # Add main/complementary strand info | |
| 2171 push(@seqDomCounts, @item[3]); # Add domain count | |
| 2172 push(@seqAvgE, @item[4]); # Add average E-value | |
| 2173 push(@seqAvgScore, @item[5]); # Add average score | |
| 2174 push(@seqScoreSum, @item[6]); # Add sum-of-scores | |
| 2175 push(@seqDNA, @item[7]); # Add DNA sequence | |
| 2176 | |
| 2177 ## Determine first and last domains, and their positions | |
| 2178 ## Set variables to unrealistic values | |
| 2179 $dnaEnd = length(@item[7]); | |
| 2180 $startDomain = "***"; | |
| 2181 $endDomain = "***"; | |
| 2182 $dnaStart = 1; | |
| 2183 $allDomains = ""; | |
| 2184 $ssuStart = 1; | |
| 2185 $ssuEnd = 0; | |
| 2186 $midStart = 1; | |
| 2187 $midEnd = 0; | |
| 2188 $lsuStart = 1; | |
| 2189 $lsuEnd = 0; | |
| 2190 $its1Start = 1; | |
| 2191 $its1End = length(@item[7]); | |
| 2192 $its2Start = 1; | |
| 2193 $its2End = length(@item[7]); | |
| 2194 $problem = ""; | |
| 2195 $problemCode = ""; | |
| 2196 | |
| 2197 $ssuFound = 0; | |
| 2198 $lsuFound = 0; | |
| 2199 $midSFound = 0; | |
| 2200 $midEFound = 0; | |
| 2201 $order = ""; | |
| 2202 undef @hitanchorlens; | |
| 2203 | |
| 2204 for ($i = 8; $i < scalar(@item); $i++) { # Go through the list of found domains in this sequence | |
| 2205 ($hitStart,$hitEnd,$hitProfile,$hitScore,$hitEval,$hitanchorlen) = split(';',@item[$i]); # Separate the hit stats into variables | |
| 2206 $allDomains = $allDomains . $hitProfile . " "; # Add found domain to the list of all domains | |
| 2207 | |
| 2208 # if ($hitStart < $dnaStart) { # If this domain is the first one so far | |
| 2209 # $dnaStart = $hitStart; # Set the start of the ITS sequence to this domain's start | |
| 2210 # $startDomain = $hitProfile; # Set this domain as the starting domain | |
| 2211 # } | |
| 2212 # if ($hitEnd > $dnaEnd) { # If this domain is the last one so far | |
| 2213 # $dnaEnd = $hitEnd; # Set the end of the ITS sequence to this domain's end | |
| 2214 # $endDomain = $hitProfile; # Set this domain as the ending domain | |
| 2215 # } | |
| 2216 | |
| 2217 if (substr($hitProfile,0,5) eq "1_SSU" ) { # If this domain is the SSU's end | |
| 2218 $dnaStart = $hitStart; # Set the start of the ITS sequence to this domain's start | |
| 2219 $startDomain = $hitProfile; # Set this domain as the starting domain | |
| 2220 $ssuStart = 1; # Set the start of the SSU sequence to the start of the sequence | |
| 2221 $ssuEnd = $hitEnd; # Set the end of the SSU sequence to this domain's end | |
| 2222 $its1Start = $hitEnd + 1; # Set the start of the ITS1 sequence to right after this domain's end | |
| 2223 if ($midEFound == 0) { | |
| 2224 $its2Start = $hitEnd + 1; # Set the end of the ITS2 sequence to right after this domain's end | |
| 2225 } | |
| 2226 $ssuFound = 1; | |
| 2227 $order = $order . "1"; | |
| 2228 @hitanchorlens[1] = $hitanchorlen; | |
| 2229 } | |
| 2230 | |
| 2231 if (substr($hitProfile,0,5) eq "2_5.8" ) { # If this domain is the 5.8S's start | |
| 2232 if ($startDomain eq "***") { | |
| 2233 $dnaStart = 1; # Set the start of the ITS sequence to the start of the sequence | |
| 2234 $startDomain = $hitProfile; # Set this domain as the starting domain | |
| 2235 } | |
| 2236 $its1End = $hitStart - 1; # Set the end of the ITS1 sequence to right before this domain's start | |
| 2237 if ($midEFound == 0) { | |
| 2238 $its2Start = $hitEnd + 1; # Set the start of the ITS2 sequence to right after this domain's end | |
| 2239 } | |
| 2240 $midStart = $hitStart; # Set the start of the 5.8S sequence to this domain's start | |
| 2241 $midSFound = 1; | |
| 2242 $order = $order . "2"; | |
| 2243 @hitanchorlens[2] = $hitanchorlen; | |
| 2244 } | |
| 2245 | |
| 2246 if (substr($hitProfile,0,5) eq "3_End" ) { # If this domain is the 5.8S's end | |
| 2247 if ($startDomain eq "***") { | |
| 2248 $dnaStart = 1; # Set the start of the ITS sequence to the start of the sequence | |
| 2249 $startDomain = $hitProfile; # Set this domain as the starting domain | |
| 2250 } | |
| 2251 if ($midSFound == 0) { | |
| 2252 $its1End = $hitStart - 1; # Set the end of the ITS1 sequence to right before this domain's start | |
| 2253 } | |
| 2254 $its2Start = $hitEnd + 1; # Set the end of the ITS2 sequence to right after this domain's end | |
| 2255 $midEnd = $hitEnd; # Set the end of the 5.8S sequence to this domain's end | |
| 2256 $midEFound = 1; | |
| 2257 $order = $order . "3"; | |
| 2258 @hitanchorlens[3] = $hitanchorlen; | |
| 2259 } | |
| 2260 | |
| 2261 if (substr($hitProfile,0,5) eq "4_LSU" ) { # If this domain is the LSU's start | |
| 2262 if ($startDomain eq "***") { | |
| 2263 $dnaStart = 1; # Set the start of the ITS sequence to the start of the sequence | |
| 2264 $startDomain = $hitProfile; # Set this domain as the starting domain | |
| 2265 } | |
| 2266 $dnaEnd = $hitEnd; # Set the end of the ITS sequence to this domain's end | |
| 2267 $endDomain = $hitProfile; # Set this domain as the ending domain | |
| 2268 $lsuStart = $hitStart; # Set the start of the LSU sequence to the start of the this domain | |
| 2269 $lsuEnd = length(@item[7]); # Set the end of the LSU sequence to the end of the sequence | |
| 2270 $its2End = $hitStart - 1; # Set the end of the ITS2 sequence to right before this domain's start | |
| 2271 if ($midSFound == 0) { | |
| 2272 $its1End = $hitStart - 1; # Set the end of the ITS1 sequence to right before this domain's start | |
| 2273 } | |
| 2274 $lsuFound = 1; | |
| 2275 $order = $order . "4"; | |
| 2276 @hitanchorlens[4] = $hitanchorlen; | |
| 2277 } | |
| 2278 } | |
| 2279 | |
| 2280 if ($ssuFound == 0) { | |
| 2281 if ($problem ne "") { | |
| 2282 $problem = $problem . "; "; | |
| 2283 } | |
| 2284 $problem = $problem . "End of SSU sequence not found"; | |
| 2285 $problemCode = $problemCode . "S"; | |
| 2286 $order = "1" . $order; | |
| 2287 } | |
| 2288 | |
| 2289 if ($lsuFound == 0) { | |
| 2290 if ($problem ne "") { | |
| 2291 $problem = $problem . "; "; | |
| 2292 } | |
| 2293 $problem = $problem . "Start of LSU sequence not found"; | |
| 2294 $problemCode = $problemCode . "L"; | |
| 2295 $order = $order . "4"; | |
| 2296 } | |
| 2297 | |
| 2298 if (($midSFound == 0) && ($midEFound == 0)) { | |
| 2299 if ($problem ne "") { | |
| 2300 $problem = $problem . "; "; | |
| 2301 } | |
| 2302 $problem = $problem . "The 5.8S sequence was not found at all"; | |
| 2303 $problemCode = $problemCode . "5"; | |
| 2304 } else { | |
| 2305 if ($midSFound == 0) { | |
| 2306 if ($problem ne "") { | |
| 2307 $problem = $problem . "; "; | |
| 2308 } | |
| 2309 $problem = $problem . "Start of 5.8S sequence not found"; | |
| 2310 $problemCode = $problemCode . "1"; | |
| 2311 } | |
| 2312 if ($midEFound == 0) { | |
| 2313 if ($problem ne "") { | |
| 2314 $problem = $problem . "; "; | |
| 2315 } | |
| 2316 $problem = $problem . "End of 5.8S sequence not found"; | |
| 2317 $problemCode = $problemCode . "2"; | |
| 2318 } | |
| 2319 } | |
| 2320 | |
| 2321 if ($dnaEnd - $dnaStart > 1500) { | |
| 2322 if ($problem ne "") { | |
| 2323 $problem = $problem . "; "; | |
| 2324 } | |
| 2325 $problem = $problem . "ITS region is suspiciously long (> 1500 bp)"; | |
| 2326 $problemCode = $problemCode . "B"; | |
| 2327 } | |
| 2328 | |
| 2329 if (length($order) == 4) { | |
| 2330 if ($order ne "1234") { | |
| 2331 if ($problem ne "") { | |
| 2332 $problem = $problem . "; "; | |
| 2333 } | |
| 2334 $problem = $problem . "Domains found in wrong order, sequence may be chimeric"; | |
| 2335 $problemCode = $problemCode . "C"; | |
| 2336 } | |
| 2337 } else { | |
| 2338 if ((substr($order,0,1) ne "1") || (substr($order,-1) ne "4")) { | |
| 2339 if ($problem ne "") { | |
| 2340 $problem = $problem . "; "; | |
| 2341 } | |
| 2342 $problem = $problem . "Domains found in wrong order, sequence may be chimeric"; | |
| 2343 $problemCode = $problemCode . "C"; | |
| 2344 } else { | |
| 2345 if (($midSFound == 1) && ($midEFound == 1)) { | |
| 2346 if ($order !~ m/23/) { | |
| 2347 if ($problem ne "") { | |
| 2348 $problem = $problem . "; "; | |
| 2349 } | |
| 2350 $problem = $problem . "Domains found in wrong order, sequence may be chimeric"; | |
| 2351 $problemCode = $problemCode . "C"; | |
| 2352 } | |
| 2353 } | |
| 2354 } | |
| 2355 } | |
| 2356 | |
| 2357 if ($its1End - $its1Start < 0) { # 5.8S overlaps SSU | |
| 2358 if (($midSFound == 1) && ($ssuFound == 1)) { | |
| 2359 $problemCode = $problemCode . "OC"; | |
| 2360 $its1Start = 0; | |
| 2361 $its1End = 0; | |
| 2362 if ($problem ne "") { | |
| 2363 $problem = $problem . "; "; | |
| 2364 } | |
| 2365 $problem = $problem . "SSU seems to overlap 5.8S, sequence may be chimeric"; | |
| 2366 } else { | |
| 2367 $problemCode = $problemCode . "X"; | |
| 2368 $its1Start = 0; | |
| 2369 $its1End = 0; | |
| 2370 if ($problem ne "") { | |
| 2371 $problem = $problem . "; "; | |
| 2372 } | |
| 2373 $problem = $problem . "No ITS1 sequence"; | |
| 2374 } | |
| 2375 } | |
| 2376 if ($its2End - $its2Start < 0) { # 5.8S overlaps LSU | |
| 2377 if (($midEFound == 1) && ($lsuFound == 1)) { | |
| 2378 $problemCode = $problemCode . "PC"; | |
| 2379 $its2Start = 0; | |
| 2380 $its2End = 0; | |
| 2381 if ($problem ne "") { | |
| 2382 $problem = $problem . "; "; | |
| 2383 } | |
| 2384 $problem = $problem . "LSU seems to overlap 5.8S, sequence may be chimeric"; | |
| 2385 } else { | |
| 2386 $problemCode = $problemCode . "Y"; | |
| 2387 $its2Start = 0; | |
| 2388 $its2End = 0; | |
| 2389 if ($problem ne "") { | |
| 2390 $problem = $problem . "; "; | |
| 2391 } | |
| 2392 $problem = $problem . "No ITS2 sequence"; | |
| 2393 } | |
| 2394 } | |
| 2395 | |
| 2396 | |
| 2397 if (($problemCode =~ m/[15]/) && ($problemCode =~ m/[S]/)) { # Sequence lack all indications of ITS1 | |
| 2398 $its1Start = 0; | |
| 2399 $its1End = 0; | |
| 2400 } | |
| 2401 if (($problemCode =~ m/[25]/) && ($problemCode =~ m/[L]/)) { # Sequence lack all indications of ITS2 | |
| 2402 $its2Start = 0; | |
| 2403 $its2End = 0; | |
| 2404 } | |
| 2405 | |
| 2406 | |
| 2407 $allhitanchorlens = @hitanchorlens[1] . "," . @hitanchorlens[2] . "," . @hitanchorlens[3] . "," . @hitanchorlens[4]; | |
| 2408 | |
| 2409 push(@allSeqDomains,$allDomains); # Add list of all domains | |
| 2410 push(@dnaStart, $dnaStart); # Add start of the ITS sequence | |
| 2411 push(@startDomain, $startDomain); # Add start domain | |
| 2412 push(@dnaEnd, $dnaEnd); # Add end of ITS sequence | |
| 2413 push(@endDomain, $endDomain); # Add end domain | |
| 2414 push(@ssuStart, $ssuStart); # Add start of the SSU sequence | |
| 2415 push(@ssuEnd, $ssuEnd); # Add end of the SSU sequence | |
| 2416 push(@midStart, $midStart); # Add start of the 5.8S sequence | |
| 2417 push(@midEnd, $midEnd); # Add end of the 5.8S sequence | |
| 2418 push(@lsuStart, $lsuStart); # Add start of the LSU sequence | |
| 2419 push(@lsuEnd, $lsuEnd); # Add end of the LSU sequence | |
| 2420 push(@its1Start, $its1Start); # Add start of the ITS1 sequence | |
| 2421 push(@its1End, $its1End); # Add end of the ITS1 sequence | |
| 2422 push(@its2Start, $its2Start); # Add start of the ITS2 sequence | |
| 2423 push(@its2End, $its2End); # Add end of the ITS2 sequence | |
| 2424 push(@problem, $problem); # Add potential problem info | |
| 2425 push(@problemCode, $problemCode); # Add potential problem info in code form | |
| 2426 push(@anchorLens, $allhitanchorlens); # Add the anchor lengths | |
| 2427 } | |
| 2428 $lc++; # Increase the line count by one | |
| 2429 } | |
| 2430 | |
| 2431 ## Save results to the summary file | |
| 2432 if ($out_sum == 1) { # If summary file should be written | |
| 2433 $itsTotal = 0; # Reset the total ITS sum | |
| 2434 foreach $typeCount (@itsCounts) { # Add ITSs from all different origins | |
| 2435 $itsTotal += $typeCount; # Add the number of ITSs for this origin | |
| 2436 } | |
| 2437 ## Write info on the found ITS sequences to the summary file | |
| 2438 print SUMMARY "Sequences detected as ITS by ITSx:\t$itsTotal\n"; | |
| 2439 print SUMMARY " On main strand: \t$itsMain\n"; | |
| 2440 print SUMMARY " On complementary strand:\t$itsCompl\n"; | |
| 2441 if ($allow_reorder == 0) { # If re-ordering of domains is not allowed | |
| 2442 print SUMMARY "Sequences detected as chimeric by ITSx:\t$itsChimeric\n"; # Write the number of reported chimeric sequences to the summary file | |
| 2443 } | |
| 2444 ## Write info on the found ITS sequence types to the summary file | |
| 2445 print SUMMARY "ITS sequences by preliminary origin:\n"; | |
| 2446 print SUMMARY " Alveolates: \t" . int(@itsCounts[ord("A")]) . "\n"; | |
| 2447 print SUMMARY " Amoebozoa: \t" . int(@itsCounts[ord("D")]) . "\n"; | |
| 2448 print SUMMARY " Bacillariophyta: \t" . int(@itsCounts[ord("C")]) . "\n"; | |
| 2449 print SUMMARY " Brown algae: \t" . int(@itsCounts[ord("I")]) . "\n"; | |
| 2450 print SUMMARY " Bryophytes: \t" . int(@itsCounts[ord("B")]) . "\n"; | |
| 2451 print SUMMARY " Euglenozoa: \t" . int(@itsCounts[ord("E")]) . "\n"; | |
| 2452 print SUMMARY " Eustigmatophytes:\t" . int(@itsCounts[ord("U")]) . "\n"; | |
| 2453 print SUMMARY " Fungi: \t" . int(@itsCounts[ord("F")]) . "\n"; | |
| 2454 print SUMMARY " Green algae: \t" . int(@itsCounts[ord("G")]) . "\n"; | |
| 2455 print SUMMARY " Liverworts: \t" . int(@itsCounts[ord("L")]) . "\n"; | |
| 2456 print SUMMARY " Metazoa: \t" . int(@itsCounts[ord("M")]) . "\n"; | |
| 2457 print SUMMARY " Microsporidia: \t" . int(@itsCounts[ord("N")]) . "\n"; | |
| 2458 print SUMMARY " Oomycetes: \t" . int(@itsCounts[ord("O")]) . "\n"; | |
| 2459 print SUMMARY " Prymnesiophytes: \t" . int(@itsCounts[ord("P")]) . "\n"; | |
| 2460 print SUMMARY " Raphidophytes: \t" . int(@itsCounts[ord("Q")]) . "\n"; | |
| 2461 print SUMMARY " Red algae: \t" . int(@itsCounts[ord("H")]) . "\n"; | |
| 2462 print SUMMARY " Rhizaria: \t" . int(@itsCounts[ord("R")]) . "\n"; | |
| 2463 print SUMMARY " Synurophyceae: \t" . int(@itsCounts[ord("S")]) . "\n"; | |
| 2464 print SUMMARY " Tracheophyta: \t" . int(@itsCounts[ord("T")]) . "\n"; | |
| 2465 print SUMMARY "-----------------------------------------------------------------\n"; | |
| 2466 close (SUMMARY); # Close the summary file | |
| 2467 } | |
| 2468 | |
| 2469 if ($out_results == 1) { | |
| 2470 close (RESULTS); # Close the results file | |
| 2471 } | |
| 2472 close (RAWOUT); # Close the raw output file | |
| 2473 close (PROBLEM); # Close the file for problematic entries | |
| 2474 if ($foundProblem == 0) { | |
| 2475 `rm $output.problematic.txt 2> /dev/null`; | |
| 2476 } | |
| 2477 if ($out_pos == 1) { | |
| 2478 close (POS); # Close the positions file | |
| 2479 } | |
| 2480 if ($out_fasta == 1) { # If FASTA output is on | |
| 2481 close (FASTA); # Close the FASTA output file | |
| 2482 if ($allow_reorder == 0) { # If re-ordering of domains is not allowed | |
| 2483 close (CHIMERA); # Close the chimera file | |
| 2484 if ($foundChimera == 0) { | |
| 2485 `rm $output.chimeric.fasta 2> /dev/null`; | |
| 2486 } | |
| 2487 } | |
| 2488 if ($out_partial > 0) { | |
| 2489 close FULLPARTIAL; | |
| 2490 } | |
| 2491 } | |
| 2492 if ($out_ssu == 1) { | |
| 2493 close (SSU); # Close the SSU file | |
| 2494 } | |
| 2495 if ($out_lsu == 1) { | |
| 2496 close (LSU); # Close the LSU file | |
| 2497 } | |
| 2498 if ($out_58S == 1) { | |
| 2499 close (MID); # Close the 58S file | |
| 2500 } | |
| 2501 if ($out_its1 == 1) { | |
| 2502 close (ITS1); # Close the ITS1 file | |
| 2503 if ($out_partial > 0) { | |
| 2504 close ITS1PARTIAL; | |
| 2505 } | |
| 2506 } | |
| 2507 if ($out_its2 == 1) { | |
| 2508 close (ITS2); # Close the ITS2 file | |
| 2509 if ($out_partial > 0) { | |
| 2510 close ITS2PARTIAL; | |
| 2511 } | |
| 2512 } | |
| 2513 if ($out_concat == 1) { | |
| 2514 close CONCAT; | |
| 2515 } | |
| 2516 | |
| 2517 if ($out_not == 1) { # If not-found output is on | |
| 2518 open (NOTFOUND, "$output\_no_detections.txt"); # Open the not-found output file | |
| 2519 while ($line = <NOTFOUND>) { # Read all entries from file | |
| 2520 chomp($line); # Remove newline char | |
| 2521 push(@nodetectionlist,$line); # Add to non-detection list | |
| 2522 } | |
| 2523 close NOTFOUND; | |
| 2524 | |
| 2525 open (NOTFOUND, ">$output\_no_detections.fasta"); # Create a not-found FASTA output file | |
| 2526 foreach $seqID (@nodetectionlist) { # For all non-detections | |
| 2527 $seq = $sequenceDB{"$seqID"}; # Get sequence from sequence database | |
| 2528 print NOTFOUND ">$seqID\n"; # Print not found sequence ID | |
| 2529 print NOTFOUND $seq . "\n"; # Print not found sequence | |
| 2530 } | |
| 2531 close NOTFOUND; | |
| 2532 } | |
| 2533 | |
| 2534 ## Clean up and finish | |
| 2535 | |
| 2536 if ($pipeline == 0) { # If ITSx is not called from the pipeline mode (i.e. from ITSx) | |
| 2537 if ($save_raw == 1) { # If raw data should be saved | |
| 2538 `mv $tempDir $output\_ITSx_raw_output`; # Change the name of the temporary directory to ..._ITSx_raw_output | |
| 2539 } else { # Else, discard the raw data | |
| 2540 `rm -rf $tempDir`; # Remove the temporary directory | |
| 2541 } | |
| 2542 } | |
| 2543 | |
| 2544 ## Get the current time and output a finished message | |
| 2545 $now = localtime; | |
| 2546 if ($silent == 0) { | |
| 2547 print STDERR "$now : Extraction finished!\n"; | |
| 2548 print STDERR "-----------------------------------------------------------------\n"; | |
| 2549 print STDERR "Thank you for using ITSx!\n"; | |
| 2550 print STDERR "Please report bugs or unsupported lineages to itsx\@microbiology.se\n"; | |
| 2551 print STDERR "\n"; | |
| 2552 } | |
| 2553 | |
| 2554 ## Write end time a summary file | |
| 2555 if ($pipeline == 0) { # If not running in pipeline mode | |
| 2556 if ($out_sum == 1) { # If summary output is on | |
| 2557 open (SUMMARY, ">>$output.summary.txt"); # Append to the summary file | |
| 2558 print SUMMARY "ITSx run finished at $now.\n"; # Write ending time for the analysis | |
| 2559 close (SUMMARY); # Close summary file | |
| 2560 } | |
| 2561 } | |
| 2562 | |
| 2563 sub hmmerSearch { | |
| 2564 $hmmerCommand = $_[0]; | |
| 2565 $outputFile = $_[1]; | |
| 2566 $strand = $_[2]; | |
| 2567 $profileSet = $_[3]; | |
| 2568 open (HMMEROUTPUT, ">$outputFile"); | |
| 2569 open (HMMER, "$hmmerCommand |"); | |
| 2570 $totalHitCount = 0; | |
| 2571 $hitCount = 0; | |
| 2572 $SSUCount = 0; | |
| 2573 $LSUCount = 0; | |
| 2574 $startCount = 0; | |
| 2575 $endCount = 0; | |
| 2576 $maxCount = 1; | |
| 2577 while (chomp($line = <HMMER>)) { | |
| 2578 if (substr($line,0,6) eq "Query:") { | |
| 2579 $hitCount = 0; | |
| 2580 $SSUCount = 0; | |
| 2581 $LSUCount = 0; | |
| 2582 $startCount = 0; | |
| 2583 $endCount = 0; | |
| 2584 undef @bestScore; | |
| 2585 undef @bestEntry; | |
| 2586 | |
| 2587 $query = substr($line,7); | |
| 2588 $queryLength = $query; | |
| 2589 $queryLength =~ s/.* *//; | |
| 2590 $queryLength =~ s/[^0-9]//g; | |
| 2591 $query =~ s/ *//; | |
| 2592 $query =~ s/ *.*//; | |
| 2593 | |
| 2594 print HMMEROUTPUT "## New query:\t$query\t$queryLength\n"; | |
| 2595 | |
| 2596 } | |
| 2597 if (substr($line,0,12) eq "Description:") { | |
| 2598 $desc = $line; | |
| 2599 } | |
| 2600 if (substr($line,0,3) eq ">> ") { | |
| 2601 ($tempshit,$hmmerSubjectName) = split(' ',$line); | |
| 2602 } | |
| 2603 if ($line =~ m/[0-9] ! /) { | |
| 2604 $stats = $line; | |
| 2605 $stats =~ s/ */\t/g; | |
| 2606 ($empty,$no,$excl,$score) = split('\t',$stats); | |
| 2607 $hitCount++; | |
| 2608 $totalHitCount++; | |
| 2609 if ($maxCount == 0) { | |
| 2610 print HMMEROUTPUT "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2611 } else { | |
| 2612 if (substr($hmmerSubjectName,0,5) eq "1_SSU") { | |
| 2613 $SSUCount++; | |
| 2614 if ($SSUCount <= $maxCount) { | |
| 2615 @bestScore[1] = $score; | |
| 2616 @bestEntry[1] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2617 } else { | |
| 2618 if ($score > @bestScore[1]) { | |
| 2619 @bestScore[1] = $score; | |
| 2620 @bestEntry[1] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2621 } | |
| 2622 } | |
| 2623 } | |
| 2624 if (substr($hmmerSubjectName,0,5) eq "4_LSU") { | |
| 2625 $LSUCount++; | |
| 2626 if ($LSUCount <= $maxCount) { | |
| 2627 @bestScore[4] = $score; | |
| 2628 @bestEntry[4] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2629 } else { | |
| 2630 if ($score > @bestScore[4]) { | |
| 2631 @bestScore[4] = $score; | |
| 2632 @bestEntry[4] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2633 } | |
| 2634 } | |
| 2635 } | |
| 2636 if (substr($hmmerSubjectName,0,5) eq "2_5.8") { | |
| 2637 $startCount++; | |
| 2638 if ($startCount <= $maxCount) { | |
| 2639 @bestScore[2] = $score; | |
| 2640 @bestEntry[2] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2641 } else { | |
| 2642 if ($score > @bestScore[2]) { | |
| 2643 @bestScore[2] = $score; | |
| 2644 @bestEntry[2] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2645 } | |
| 2646 } | |
| 2647 } | |
| 2648 if (substr($hmmerSubjectName,0,5) eq "3_End") { | |
| 2649 $endCount++; | |
| 2650 if ($endCount <= $maxCount) { | |
| 2651 @bestScore[3] = $score; | |
| 2652 @bestEntry[3] = "$query\t$hmmerSubjectName\t$queryLength$stats\n"; | |
| 2653 } else { | |
| 2654 if ($score > @bestScore[3]) { | |
| 2655 @bestScore[3] = $score; | |
| 2656 @bestEntry[3] = "$query\t$hmmerSubjectName\t$queryLength$stats\n" | |
| 2657 } | |
| 2658 } | |
| 2659 } | |
| 2660 } | |
| 2661 } | |
| 2662 if (substr($line,0,2) eq "//") { | |
| 2663 if ($maxCount > 0) { | |
| 2664 if ($SSUCount > 0) { | |
| 2665 print HMMEROUTPUT @bestEntry[1]; | |
| 2666 } | |
| 2667 if ($startCount > 0) { | |
| 2668 print HMMEROUTPUT @bestEntry[2]; | |
| 2669 } | |
| 2670 if ($endCount > 0) { | |
| 2671 print HMMEROUTPUT @bestEntry[3]; | |
| 2672 } | |
| 2673 if ($LSUCount > 0) { | |
| 2674 print HMMEROUTPUT @bestEntry[4]; | |
| 2675 } | |
| 2676 } | |
| 2677 print HMMEROUTPUT "//\n"; | |
| 2678 } | |
| 2679 } | |
| 2680 close(HMMEROUTPUT); | |
| 2681 $now = localtime; | |
| 2682 if ($silent == 0) { | |
| 2683 if ($strand eq "M") { | |
| 2684 print STDERR " $now : " . ucfirst($profileIndex{$profileSet}) . " analysis of main strand finished.\n"; # Print finished type | |
| 2685 } else { | |
| 2686 print STDERR " $now : " . ucfirst($profileIndex{$profileSet}) . " analysis of complementary strand finished.\n"; # Print finished type | |
| 2687 } | |
| 2688 } | |
| 2689 } | |
| 2690 | |
| 2691 | |
| 2692 ## Please send beers, pizzas, cakes, fruit pies, job positions and other types of feedback to: | |
| 2693 ## johan.bengtsson [at] microbiology.se | |
| 2694 ## Looking forward to hearing from you.... visit my website: www.microbiology.se for info on my research | |
| 2695 ## //Johan Bengtsson, 2012-2014 | 
