Mercurial > repos > greg > p_chunks
comparison p_chunks.R @ 0:f45c65e3fd18 draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 10 Jan 2023 20:40:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f45c65e3fd18 |
|---|---|
| 1 #!/bin/env Rscript | |
| 2 | |
| 3 library(parallel) | |
| 4 library(hash) | |
| 5 library(stringr) | |
| 6 library(grid) | |
| 7 library(gridExtra) | |
| 8 library(optparse) | |
| 9 | |
| 10 options(width = 180) | |
| 11 | |
| 12 printif = function(string = NULL, condition){ | |
| 13 if (condition) { | |
| 14 print(string) | |
| 15 } | |
| 16 } | |
| 17 | |
| 18 findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, | |
| 19 amrPSLFile = NULL, amrDatabase, noAMR = FALSE, | |
| 20 incPSLFile = NULL, incDatabase, noInc = FALSE, | |
| 21 outputDirectory = NA, overwrite = TRUE, | |
| 22 maxTargetLength = 300000, | |
| 23 minQueryLength = 500, | |
| 24 makeCircos = FALSE, | |
| 25 minQueryCoverage = 1/2, minTargetCoverage = 1/2, | |
| 26 searchDepth = NULL, | |
| 27 verbosity = 0) { | |
| 28 | |
| 29 ## Verify the arguments | |
| 30 argumentsGood = TRUE | |
| 31 if (minQueryCoverage < .1 || minQueryCoverage > 1) { | |
| 32 argumentsGood = FALSE | |
| 33 message(paste('Minimum query coverage', minQueryCoverage, 'is outside of the range 0.1 <= x <= 1')) | |
| 34 } | |
| 35 if (minTargetCoverage < 0.02 || minTargetCoverage > 1) { | |
| 36 argumentsGood = FALSE | |
| 37 message(paste('Minimum target coverage', minTargetCoverage, 'is outside of the range 0.1 <= x <= 1')) | |
| 38 } | |
| 39 if (!argumentsGood){ | |
| 40 message('There is a problem with the arguments') | |
| 41 return() | |
| 42 } | |
| 43 | |
| 44 printif(paste('Finding plasmids in', plasmidPSLFile), verbosity > 0) | |
| 45 | |
| 46 ## Keep track of the total score in case we doing a grid search | |
| 47 totalPlasmidScore = 0 | |
| 48 | |
| 49 ## Check for the existence of the output directory, remove if it exists | |
| 50 if (file.exists(outputDirectory)) { | |
| 51 printif(paste('Removing existing output directory', outputDirectory), verbosity > 1) | |
| 52 unlink(outputDirectory, recursive = TRUE) | |
| 53 } | |
| 54 printif(paste('Making output directory', outputDirectory), verbosity > 1) | |
| 55 dir.create(outputDirectory) | |
| 56 outputPrefix = paste0(outputDirectory, "/plasmids") | |
| 57 | |
| 58 ## Read in and filter the plasmid hits | |
| 59 plasmidHits = read.table(plasmidPSLFile, row.names = NULL, header = FALSE, sep = '\t', stringsAsFactors = FALSE, skip = 5) | |
| 60 colnames(plasmidHits) = c('match', 'mismatch', 'rep_m', 'Ns', 'tgap_c', 'tgap_b', | |
| 61 'qgap_c', 'qgap_b', 'strand', | |
| 62 'target', 'tlength', 'tstart', 'tstop', | |
| 63 'query', 'qlength', 'qstart', 'qstop', | |
| 64 'blocks', 'block_sizes', 'tstarts', 'qstarts') | |
| 65 printif(paste("Sequence-plasmid hits:", nrow(plasmidHits)), verbosity > 0) | |
| 66 | |
| 67 plasmidHits = plasmidHits[order(plasmidHits[,'target'], -plasmidHits[,'qlength']), ] | |
| 68 | |
| 69 ## Toss out any hits missing information | |
| 70 plasmidHits = plasmidHits[complete.cases(plasmidHits),] | |
| 71 | |
| 72 ## Toss out very long plasmid sequences -- probably actually genome chunks labeled incorrectly | |
| 73 veryLongHits = sum(plasmidHits[,'tlength'] >= maxTargetLength) | |
| 74 printif(paste('Removing', veryLongHits, 'hits greater than', maxTargetLength), verbosity > 0) | |
| 75 plasmidHits = plasmidHits[plasmidHits[,'tlength'] <= maxTargetLength, ] | |
| 76 printif(paste("Sequence-plasmid hits after removing very long plasmids:", nrow(plasmidHits)), verbosity > 0) | |
| 77 | |
| 78 ## Toss out very short query sequences -- probably junk or repeats | |
| 79 veryShortQuery = sum(plasmidHits[,'qlength'] >= minQueryLength) | |
| 80 printif(paste('Removing', veryShortQuery, 'queries less than', minQueryLength), verbosity > 0) | |
| 81 plasmidHits = plasmidHits[plasmidHits[,'qlength'] >= minQueryLength, ] | |
| 82 printif(paste("Sequence-plasmid hits after removing very short queries:", nrow(plasmidHits)), verbosity > 0) | |
| 83 | |
| 84 ## Toss out sequece-plasmid pairs below the coverage cutoff | |
| 85 sequenceMatches = aggregate(x = plasmidHits[,'match',drop = FALSE], | |
| 86 by = list(plasmidHits[,'query'], plasmidHits[,'target']), FUN = sum) | |
| 87 printif(head(sequenceMatches), verbosity > 1) | |
| 88 printif(paste('Sequence-plasmid pair matches:', paste(dim(sequenceMatches), collapse = 'x')), verbosity > 1) | |
| 89 | |
| 90 sequenceLengths = aggregate(x = plasmidHits[,'qlength', drop = FALSE], | |
| 91 by = list(plasmidHits[,'query'], plasmidHits[,'target']), FUN = max) | |
| 92 printif(head(sequenceLengths), verbosity > 1) | |
| 93 printif(paste('Sequence-plasmid pair lengths:', paste(dim(sequenceLengths), collapse = 'x')), verbosity > 1) | |
| 94 | |
| 95 matchingFractions = cbind(sequenceMatches[,c(1,2)], sequenceMatches[,3] / sequenceLengths[,3]) | |
| 96 colnames(matchingFractions) = c('query', 'target', 'fraction') | |
| 97 printif(head(matchingFractions), verbosity > 1) | |
| 98 printif(paste('Sequence-plasmid pair fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) | |
| 99 | |
| 100 matchingFractions = matchingFractions[matchingFractions[,'fraction'] >= minQueryCoverage,] | |
| 101 printif(head(matchingFractions), verbosity > 1) | |
| 102 printif(paste('Passing sequence-plasmid pair fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) | |
| 103 | |
| 104 aboveMinCoverage = apply(matchingFractions, 1, function(i){paste0(i['query'], '|', i['target'])}) | |
| 105 plasmidHits = plasmidHits[apply(plasmidHits, 1, function(i){paste0(i['query'], '|', i['target'])}) %in% aboveMinCoverage, ] | |
| 106 printif(paste("Sequence-plasmid hits after removing low-coverage hits:", nrow(plasmidHits)), verbosity > 0) | |
| 107 | |
| 108 ## Toss out plasmid sequences below the coverage cutoff | |
| 109 targetMatches = aggregate(x = plasmidHits[,'match',drop = FALSE], | |
| 110 by = list(plasmidHits[,'target']), FUN = sum) | |
| 111 printif(head(targetMatches), verbosity > 1) | |
| 112 printif(paste('Plasmid matches:', paste(dim(targetMatches), collapse = 'x')), verbosity > 1) | |
| 113 | |
| 114 targetLengths = aggregate(x = plasmidHits[,'tlength', drop = FALSE], | |
| 115 by = list(plasmidHits[,'target']), FUN = max) | |
| 116 printif(head(targetLengths), verbosity > 1) | |
| 117 printif(paste('Plasmid lengths:', paste(dim(targetLengths), collapse = 'x')), verbosity > 1) | |
| 118 | |
| 119 matchingFractions = cbind(targetMatches[,1], targetMatches[,2] / targetLengths[,2]) | |
| 120 colnames(matchingFractions) = c('target', 'fraction') | |
| 121 printif(head(matchingFractions), verbosity > 1) | |
| 122 printif(paste('Plasmid fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) | |
| 123 | |
| 124 matchingFractions = matchingFractions[matchingFractions[,'fraction'] >= minTargetCoverage,] | |
| 125 printif(head(matchingFractions), verbosity > 1) | |
| 126 printif(paste('Passing plasmid fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1) | |
| 127 | |
| 128 aboveMinCoverage = matchingFractions[, 'target'] | |
| 129 plasmidHits = plasmidHits[plasmidHits[, 'target'] %in% aboveMinCoverage, ] | |
| 130 printif(paste("Sequence-plasmid hits after removing low-coverage hits:", nrow(plasmidHits)), verbosity > 0) | |
| 131 | |
| 132 | |
| 133 ## If we're out of sequece-plasmid hits, then stop here | |
| 134 if (nrow(plasmidHits) == 0) { | |
| 135 message(paste('Not hits found')) | |
| 136 return | |
| 137 } | |
| 138 | |
| 139 ## Find out how much of each query (contig) is covered by each target (plasmid). | |
| 140 ## Query coverage is constant and does not change as we assign contigs to plasmids | |
| 141 queryCoverage = hash() | |
| 142 queryMismatches = hash() | |
| 143 for (i in 1:nrow(plasmidHits)) { | |
| 144 if (!(i %% 1000)) { | |
| 145 printif(paste('Processing hit', i, '/', nrow(plasmidHits)), verbosity > 0) | |
| 146 } | |
| 147 | |
| 148 query = plasmidHits[i,'query'] | |
| 149 target = plasmidHits[i, 'target'] | |
| 150 | |
| 151 ## Represent each sequence-plasmid hit as a series of 0/1 vectors that | |
| 152 if (!has.key(query, queryCoverage)) { | |
| 153 queryCoverage[[query]] = hash() | |
| 154 queryMismatches[[query]] = hash() | |
| 155 } | |
| 156 if (!has.key(target, queryCoverage[[query]])) { | |
| 157 queryCoverage[[query]][[target]] = rep(0, times = plasmidHits[i, 'qlength']) | |
| 158 queryMismatches[[query]][[target]] = 0 | |
| 159 } | |
| 160 | |
| 161 blockSizes = as.numeric(unlist(strsplit(x = plasmidHits[i,'block_sizes'], ','))) | |
| 162 qBlockStarts = as.numeric(unlist(strsplit(x = plasmidHits[i,'qstarts'], ','))) | |
| 163 | |
| 164 for (j in 1:length(blockSizes)) { | |
| 165 queryCoverage[[query]][[target]][qBlockStarts[j]:(qBlockStarts[j]+blockSizes[j])] = 1 | |
| 166 } | |
| 167 queryMismatches[[query]][[target]] = queryMismatches[[query]][[target]] + plasmidHits[i,'mismatch'] | |
| 168 } | |
| 169 | |
| 170 | |
| 171 ## Pull the full plasmid names from the blast database because BLAT/minimap2 doesn't report them, just the ID's | |
| 172 targetIDs = plasmidHits[,'target'] | |
| 173 targetIDs = gsub("\\|$", "", targetIDs) | |
| 174 targetIDs = gsub(".*(\\|.*)$", "\\1", targetIDs) | |
| 175 noDotIDs = gsub("\\|", "", targetIDs) | |
| 176 noDotIDs = gsub("(^H[^.]+).[0-9]+$", "\\1", noDotIDs) | |
| 177 noDotIDs = cbind(noDotIDs) | |
| 178 | |
| 179 targetFile = paste0(outputDirectory, '/targets.tsv', sep = '') | |
| 180 write.table(file = targetFile, x = noDotIDs, quote = FALSE, row.names = FALSE, col.names = FALSE) | |
| 181 command = paste('blastdbcmd -db', plasmidDatabase, | |
| 182 '-entry_batch', targetFile, | |
| 183 '| grep ">"') | |
| 184 targetNames = system(command, intern = TRUE) | |
| 185 printif(paste('Found', length(targetNames), 'target names for', length(targetIDs), 'targets.'), verbosity > 0) | |
| 186 | |
| 187 targetNames = gsub('^>.*\\| ', '', targetNames) | |
| 188 targetNames = gsub('^>[^ ]+', '', targetNames) | |
| 189 plasmidHits = cbind(plasmidHits, targetIDs, targetNames) | |
| 190 printif(paste('Named hits:', nrow(plasmidHits)), verbosity > 1) | |
| 191 | |
| 192 #Pull just the plasmids out of the larget set of hits, i.e, make sure it has the word 'plasmid' in the description. | |
| 193 plasmidHits = plasmidHits[grep('plasmid|vector', plasmidHits[,'targetNames'], ignore.case = TRUE), ,drop = FALSE] | |
| 194 plasmidHits = plasmidHits[!grepl('tig0000|unnamed', plasmidHits[,'targetNames'], ignore.case = TRUE), , drop = FALSE] | |
| 195 printif(paste("Sequece-plasmid hits after screening by name:", paste(dim(plasmidHits), collapse = 'x')), verbosity > 1) | |
| 196 | |
| 197 ## Stop if there is nothing left | |
| 198 if (is.null(plasmidHits)) { | |
| 199 message('Not hits found') | |
| 200 return() | |
| 201 } | |
| 202 if (nrow(plasmidHits) == 0) { | |
| 203 message('Not hits found') | |
| 204 return() | |
| 205 } | |
| 206 | |
| 207 ## Clean up the plasmid names -- they look like crap by default. | |
| 208 plasmidNames = plasmidHits[,'targetNames'] | |
| 209 plasmidNames = gsub(', comp.*', '', plasmidNames) | |
| 210 plasmidNames = gsub(', contig.*', '', plasmidNames) | |
| 211 plasmidNames = gsub(', partial.*', '', plasmidNames) | |
| 212 plasmidNames = gsub('strain ', '', plasmidNames) | |
| 213 plasmidNames = gsub('^ *', '', plasmidNames) | |
| 214 plasmidNames = sub('^(cl\\|)(.*?) ', '', plasmidNames) | |
| 215 plasmidNames = sub('subsp. (.*?) ', '', plasmidNames) | |
| 216 plasmidNames = sub('serovar (.*?) ', '', plasmidNames) | |
| 217 plasmidNames = sub('strain ', '', plasmidNames) | |
| 218 plasmidNames = sub('plasmid$', '', plasmidNames) | |
| 219 plasmidHits[,'targetNames'] = plasmidNames | |
| 220 | |
| 221 ## Just take the best hit for each plasmid, hence the head, 1 in the agg | |
| 222 plasmidNames = aggregate(plasmidHits, by = list(plasmidHits[,'query']), FUN = head, 1) | |
| 223 plasmidNames = plasmidNames[, 'targetNames', drop = FALSE] | |
| 224 | |
| 225 ## Find the set of plasmid coverage hits for each itteration | |
| 226 usedContigs = c() | |
| 227 plasmidMismatches = c() | |
| 228 | |
| 229 ## Order hits by the plasmid ID and the query length | |
| 230 plasmidHits = plasmidHits[order(plasmidHits[,'target'], -plasmidHits[,'qlength']), ] | |
| 231 | |
| 232 ## Iterate, finding plasmids until we run out of usable sequence-plasmid its | |
| 233 plasmidNumber = 0 | |
| 234 plasmidResults = c() | |
| 235 while (1) { | |
| 236 | |
| 237 ## Keep track of how many plasmids we have gone over | |
| 238 plasmidNumber = plasmidNumber + 1 | |
| 239 | |
| 240 printif(paste('Sequence-plasmid hits left:', nrow(plasmidHits)), verbosity > 1) | |
| 241 contigToPlasmid = hash() | |
| 242 plasmidToContig = hash() | |
| 243 plasmidCoverage = hash() | |
| 244 plasmidCoverageWithRepeats = hash() | |
| 245 contigCoverage = hash() | |
| 246 | |
| 247 ##Find contig/plasmid plasmid/contig pairs | |
| 248 if (is.null(plasmidHits)) { | |
| 249 break | |
| 250 } | |
| 251 if (nrow(plasmidHits) == 0) { | |
| 252 break | |
| 253 } | |
| 254 repLengths = c() | |
| 255 | |
| 256 | |
| 257 ## Find the coverage of each plasmid in the possible set by the contigs | |
| 258 for (i in 1:nrow(plasmidHits)) { | |
| 259 | |
| 260 query = plasmidHits[i,'query'] | |
| 261 target = plasmidHits[i,'target'] | |
| 262 matches = plasmidHits[i,'match'] | |
| 263 mismatches = plasmidHits[i,'mismatch'] | |
| 264 score = matches - mismatches | |
| 265 queryLength = plasmidHits[i,'qlength'] | |
| 266 blockSizes = as.numeric(unlist(strsplit(plasmidHits[i, 'block_sizes'], ','))) | |
| 267 queryStarts = as.numeric(unlist(strsplit(plasmidHits[i, 'qstarts'], ','))) | |
| 268 targetStarts = as.numeric(unlist(strsplit(plasmidHits[i, 'tstarts'], ','))) | |
| 269 | |
| 270 ## Skip matches which have less than 50% of the bases from the contig on the plasmid -- probably not a good match | |
| 271 if ((sum(queryCoverage[[query]][[target]]) - queryMismatches[[query]][[target]]) / queryLength <= minQueryCoverage) { | |
| 272 next | |
| 273 } | |
| 274 | |
| 275 targetLength = plasmidHits[i, 'tlength']; targetStart = plasmidHits[i, 'tstart']; targetStop = plasmidHits[i, 'tstop'] | |
| 276 | |
| 277 ## Relate this contig to this plasmid | |
| 278 if (!has.key(query, contigToPlasmid)) { | |
| 279 contigToPlasmid[[query]] = hash() | |
| 280 contigCoverage[[query]] = hash() | |
| 281 } | |
| 282 if (!has.key(target, contigToPlasmid[[query]])) { | |
| 283 contigToPlasmid[[query]][[target]] = score | |
| 284 contigCoverage[[query]][[target]] = rep(0, queryLength) | |
| 285 } else { | |
| 286 contigToPlasmid[[query]][[target]] = contigToPlasmid[[query]][[target]] + score | |
| 287 } | |
| 288 | |
| 289 ## Keep track of target(plasmid) coverage by the contigs | |
| 290 if (!has.key(target, plasmidCoverage)) { | |
| 291 plasmidCoverage[[target]] = rep(0, targetLength) | |
| 292 plasmidCoverageWithRepeats[[target]] = rep(0, targetLength) | |
| 293 plasmidToContig[[target]] = hash() | |
| 294 plasmidMismatches[target] = 0 | |
| 295 } | |
| 296 | |
| 297 penalized = FALSE | |
| 298 for (j in 1:length(blockSizes)) { | |
| 299 | |
| 300 ## Keep track of all contig alignments to this plasmid, even with repeats | |
| 301 plasmidCoverageWithRepeats[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])] = 1 | |
| 302 | |
| 303 ## Skip if this region of the query sequence has already been assigned to this plasmid | |
| 304 if (sum(contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0) <= 50) { | |
| 305 printif(paste('Sequence', query, 'already used for', target, | |
| 306 '. ', paste0(queryStarts[j], '-', queryStarts[j] + blockSizes[j])), verbosity > 2) | |
| 307 next | |
| 308 } | |
| 309 if (!penalized) { | |
| 310 ## Penalty for every gap, only penalize once per match | |
| 311 ## TODO: Penalize for gap length, not just once per gap | |
| 312 #plasmidMismatches[target] = plasmidMismatches[target] + (length(blockSizes) - 1) * 100 | |
| 313 plasmidMismatches[target] = plasmidMismatches[target] + mismatches * 5 | |
| 314 penalized = TRUE | |
| 315 } | |
| 316 | |
| 317 plasmidCoverage[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])][contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0] = | |
| 318 plasmidCoverage[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])][contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0] + 1 | |
| 319 contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] = 1 | |
| 320 } | |
| 321 | |
| 322 ## Relate this plasmid to this contig | |
| 323 if (!has.key(query, plasmidToContig[[target]])) { | |
| 324 plasmidToContig[[target]][[query]] = score | |
| 325 } else { | |
| 326 plasmidToContig[[target]][[query]] = plasmidToContig[[target]][[query]] + score | |
| 327 } | |
| 328 if (target == 'NZ_GG692894.1' && query == 'contig_3_0') { | |
| 329 print('NZ_GG692894.1') | |
| 330 print(sum(plasmidCoverage[[target]])) | |
| 331 print(sum(contigCoverage[[query]][[target]])) | |
| 332 } | |
| 333 | |
| 334 } | |
| 335 | |
| 336 ## Get the best set of plasmids out, i.e., the set with the most bases matching between the contig and plasmid | |
| 337 plasmidScores = c() | |
| 338 for (thisPlasmid in keys(plasmidCoverage)){ | |
| 339 thisPlasmidScore = sum(plasmidCoverage[[thisPlasmid]]) | |
| 340 plasmidScores = c(plasmidScores, thisPlasmidScore) | |
| 341 names(plasmidScores)[length(plasmidScores)] = thisPlasmid | |
| 342 } | |
| 343 plasmidScores = sort(plasmidScores - plasmidMismatches[names(plasmidScores)], dec = TRUE) | |
| 344 | |
| 345 if (length(plasmidScores) > 0) { | |
| 346 printif('Highest scoring plasmids', verbosity > 1) | |
| 347 printif(head(cbind(plasmidScores), 20), verbosity > 1) | |
| 348 } | |
| 349 | |
| 350 ## Stop searching for plasmids if nothing matches well or we're out of hits | |
| 351 if (length(plasmidScores) == 0) { | |
| 352 printif('Out of plasmids', verbosity > 0) | |
| 353 break | |
| 354 } else if (max(plasmidScores) < 500) { | |
| 355 printif('Out of min-scoring plasmids', verbosity > 0) | |
| 356 break | |
| 357 } | |
| 358 | |
| 359 ## For each matching plasmid, ordered by total bases matching the assembly, find the set of corresponding contigs | |
| 360 plasmidToUse = 1 | |
| 361 if (!is.null(searchDepth) && plasmidNumber <= length(searchDepth)) { | |
| 362 plasmidToUse = searchDepth[plasmidNumber] | |
| 363 } | |
| 364 if (plasmidToUse > length(plasmidScores)) { | |
| 365 plasmidToUse = length(plasmidScores) | |
| 366 } | |
| 367 | |
| 368 plasmid = names(plasmidScores)[plasmidToUse] | |
| 369 print(paste('Plasmid picked', plasmid)) | |
| 370 totalPlasmidScore = totalPlasmidScore + plasmidScores[plasmid] | |
| 371 printif(paste("Pulling sequences for", plasmid), verbosity > 0) | |
| 372 | |
| 373 ## Find contigs what haven't already been given to another plasmid so we can assign them next round | |
| 374 plasmidContigs = keys(plasmidToContig[[plasmid]]) | |
| 375 unusedContigs = plasmidContigs[!(plasmidContigs %in% usedContigs)] | |
| 376 | |
| 377 ## If no unused contigs that also map to this plasmid then skip it | |
| 378 if (length(unusedContigs) == 0) { | |
| 379 printif(paste("No unused sequences for", plasmid), verbosity > 1) | |
| 380 next | |
| 381 } | |
| 382 | |
| 383 ## Keep just the rows for this plasmid and which haven't already been used by another plasmid | |
| 384 plasmidRows = plasmidHits[plasmidHits[,'target'] == plasmid,] | |
| 385 plasmidRows = plasmidRows[plasmidRows[,'query'] %in% unusedContigs,,drop = FALSE] | |
| 386 plasmidName = plasmidRows[1, 'targetNames'] | |
| 387 plasmidID = plasmidRows[1, 'targetIDs'] | |
| 388 ## Get the plasmid length | |
| 389 command = paste('blastdbcmd -db', plasmidDatabase, | |
| 390 '-entry', plasmidID, | |
| 391 '-outfmt "%l"') | |
| 392 plasmidLength = system(command, intern = TRUE) | |
| 393 plasmidLength = rep(plasmidLength, nrow(plasmidRows)) | |
| 394 | |
| 395 ## How many bases from the plasmid are uncovered? | |
| 396 plasmidMissing = rep(sum(plasmidCoverageWithRepeats[[as.character(plasmidID)]] == 0), nrow(plasmidRows)) | |
| 397 | |
| 398 ## Keep track of all of the contigs included | |
| 399 usedContigs = c(usedContigs, unusedContigs) | |
| 400 | |
| 401 ## How many matching bases for each query sequence? | |
| 402 thisPlasmidQuerySizes = c() | |
| 403 thisPlasmidMatches = c() | |
| 404 for (contig in unusedContigs) { | |
| 405 thisPlasmidQuerySizes = c(thisPlasmidQuerySizes, length(queryCoverage[[contig]][[plasmid]])) | |
| 406 thisPlasmidMatches = c(thisPlasmidMatches, sum(queryCoverage[[contig]][[plasmid]])) | |
| 407 } | |
| 408 | |
| 409 ## Add this plasmid's hits onto the growing list of sequence-plasmid hits | |
| 410 thisPlasmidResults = cbind(unusedContigs, plasmidName, as.character(plasmidID), thisPlasmidQuerySizes, thisPlasmidMatches, plasmidLength, plasmidMissing) | |
| 411 colnames(thisPlasmidResults) = c('query.name', 'plasmid.name', 'plasmid.accession', 'query.size', 'aligned.bases', 'plasmid.size', 'plasmid.missing') | |
| 412 thisPlasmidResults = thisPlasmidResults[order(-thisPlasmidMatches), ] | |
| 413 plasmidResults = rbind(plasmidResults, thisPlasmidResults) | |
| 414 | |
| 415 ## Remove the contigs added to this plasmid from the list of plasmid/contig BLAT hits | |
| 416 plasmidHits = plasmidHits[!(plasmidHits[,'query'] %in% usedContigs),] | |
| 417 plasmidHits = plasmidHits[!(plasmidHits[,'target'] == plasmid),] | |
| 418 } | |
| 419 | |
| 420 rownames(plasmidResults) = plasmidResults[,1] | |
| 421 | |
| 422 ## Check for the presence of AMR genes in this file | |
| 423 if (!noAMR) { | |
| 424 amrBEDFile = paste0(outputDirectory, '/amrMapping.bed') | |
| 425 command = paste('cat', amrPSLFile, | |
| 426 '| awk -F \'\\t\' \'($3 >= 80) && ($4 / $14 >= .95){OFS = "\t"; print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? "+" : "-")}\'', | |
| 427 '| sort -k 1,1 -k 2,2n >', amrBEDFile) | |
| 428 printif(command, verbosity > 1) | |
| 429 system(command) | |
| 430 ## Find local overlapping regions | |
| 431 amrMergedBEDFile = paste0(outputDirectory, '/amrMergedMapping.bed') | |
| 432 command = paste('bedtools merge -d -30 -i', amrBEDFile, '>', amrMergedBEDFile) | |
| 433 printif(command, verbosity > 1) | |
| 434 system(command) | |
| 435 ## Find the best AMR gene for each region | |
| 436 amrFinalBEDFile = paste0(outputDirectory, '/amrFinal.bed') | |
| 437 command = paste('bedtools intersect', | |
| 438 '-a', amrBEDFile, | |
| 439 '-b', amrMergedBEDFile, | |
| 440 '-f .9 -F .9', | |
| 441 '-wao', | |
| 442 '| awk \'$7 != "."\'', | |
| 443 '| awk \'{OFS="\t";locus=$7"\t"$8"\t"$9; if($5 > s[locus]){s[locus]=$5;b[locus] = $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}}END{for(i in b){print i,b[i]}}\'', | |
| 444 '>', amrFinalBEDFile) | |
| 445 printif(command, verbosity > 1) | |
| 446 system(command) | |
| 447 | |
| 448 ## Read the AMR results in and add them to the plasmid contigs | |
| 449 amrResults = read.table(file = amrFinalBEDFile, header = FALSE, row.names = NULL, stringsAsFactors = FALSE, quote = '') | |
| 450 amrResults[,7] = gsub('(_.*$)|(.*\\|)', '', amrResults[,7]) | |
| 451 amrResults = aggregate(amrResults[ , 7, drop = FALSE], by = list(amrResults[,1]), function(i){paste(i, collapse = ', ')}) | |
| 452 rownames(amrResults) = amrResults[,1] | |
| 453 amrResults = amrResults[ , 2, drop = FALSE] | |
| 454 print(amrResults) | |
| 455 | |
| 456 plasmidResults = cbind(plasmidResults, rep('', nrow(plasmidResults))) | |
| 457 colnames(plasmidResults)[ncol(plasmidResults)] = 'amr' | |
| 458 plasmidResults[rownames(plasmidResults) %in% rownames(amrResults), 'amr'] = | |
| 459 amrResults[rownames(plasmidResults)[rownames(plasmidResults) %in% rownames(amrResults)], 1] | |
| 460 | |
| 461 } | |
| 462 | |
| 463 ## Check for the presence of incompatibility groups in this file | |
| 464 if (!noInc) { | |
| 465 incBEDFile = paste0(outputDirectory, '/incMapping.bed') | |
| 466 command = paste('cat', incPSLFile, | |
| 467 '| awk -F \'\\t\' \'($3 >= 80) && ($4 / $14 >= .95){OFS = "\t"; print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? "+" : "-")}\'', | |
| 468 '| sort -k 1,1 -k 2,2n >', incBEDFile) | |
| 469 printif(command, verbosity > 1) | |
| 470 system(command) | |
| 471 ## Find local overlapping regions | |
| 472 incMergedBEDFile = paste0(outputDirectory, '/incMergedMapping.bed') | |
| 473 command = paste('bedtools merge -d -30 -i', incBEDFile, '>', incMergedBEDFile) | |
| 474 printif(command, verbosity > 1) | |
| 475 system(command) | |
| 476 ## Find the best INC group for each region | |
| 477 incFinalBEDFile = paste0(outputDirectory, '/incFinal.bed') | |
| 478 command = paste('bedtools intersect', | |
| 479 '-a', incBEDFile, | |
| 480 '-b', incMergedBEDFile, | |
| 481 '-f .9 -F .9', | |
| 482 '-wao', | |
| 483 '| awk \'$7 != "."\'', | |
| 484 '| awk \'{OFS="\t";locus=$7"\t"$8"\t"$9; if($5 > s[locus]){s[locus]=$5;b[locus] = $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}}END{for(i in b){print i,b[i]}}\'', | |
| 485 '>', incFinalBEDFile) | |
| 486 printif(command, verbosity > 1) | |
| 487 system(command) | |
| 488 | |
| 489 ## Read the inc group results in and add them to the plasmid contigs | |
| 490 incResults = read.table(file = incFinalBEDFile, header = FALSE, row.names = NULL, stringsAsFactors = FALSE, quote = '') | |
| 491 incResults[,7] = gsub('(_.*$)|(.*\\|)', '', incResults[,7]) | |
| 492 incResults = aggregate(incResults[ , 7, drop = FALSE], by = list(incResults[,1]), function(i){paste(i, collapse = ', ')}) | |
| 493 rownames(incResults) = incResults[,1] | |
| 494 incResults = incResults[ , 2, drop = FALSE] | |
| 495 print(incResults) | |
| 496 | |
| 497 plasmidResults = cbind(plasmidResults, rep('', nrow(plasmidResults))) | |
| 498 colnames(plasmidResults)[ncol(plasmidResults)] = 'inc' | |
| 499 plasmidResults[rownames(plasmidResults) %in% rownames(incResults), 'inc'] = | |
| 500 incResults[rownames(plasmidResults)[rownames(plasmidResults) %in% rownames(incResults)], 1] | |
| 501 } | |
| 502 | |
| 503 ## Write the plasmid results to file | |
| 504 plasmidChunksFile = paste0(outputDirectory, '/plasmids.tsv') | |
| 505 write.table(file = plasmidChunksFile, x = plasmidResults, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE) | |
| 506 | |
| 507 ## Dump a sequence file of potential plasmid contigs | |
| 508 plasmidSequenceFile = paste0(outputPrefix, '.fna') | |
| 509 system(paste0('echo "" >', plasmidSequenceFile)) | |
| 510 for (contig in plasmidResults[,'query.name']) { | |
| 511 command = paste('faidx', paste0('assembly.fasta'), contig, '>>', plasmidSequenceFile) | |
| 512 print(command) | |
| 513 #system(command) | |
| 514 } | |
| 515 | |
| 516 | |
| 517 ## Return the total score of this round, in case we are doing a search | |
| 518 return(totalPlasmidScore) | |
| 519 | |
| 520 } | |
| 521 | |
| 522 pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL, | |
| 523 amrPSLFile = NULL, amrDatabase, noAMR = FALSE, | |
| 524 incPSLFile = NULL, incDatabase, noInc = FALSE, | |
| 525 outputDirectory = NA, overwrite = TRUE, | |
| 526 maxTargetLength = 300000, | |
| 527 minQueryLength = 200, | |
| 528 makeCircos = FALSE, | |
| 529 minQueryCoverage = 1/2, minTargetCoverage = 1/2, | |
| 530 searchDepth = c(1), | |
| 531 threads = 1, | |
| 532 verbosity = 2) { | |
| 533 | |
| 534 print(plasmidDatabase) | |
| 535 | |
| 536 ## Verify the arguments | |
| 537 argumentsGood = TRUE | |
| 538 if (!file.exists(plasmidPSLFile)) { | |
| 539 argumentsGood = FALSE | |
| 540 message(paste('Plasmid PSL file', plasmidPSLFile, 'not found')) | |
| 541 } | |
| 542 if (is.na(outputDirectory)) { | |
| 543 argumentsGood = FALSE | |
| 544 message('Output directory not given') | |
| 545 } | |
| 546 if (file.exists(outputDirectory) && !overwrite) { | |
| 547 argumentsGood = FALSE | |
| 548 message(paste('Output directory', outputDirectory, 'already exists. Add overwrite = TRUE')) | |
| 549 } | |
| 550 if (minQueryCoverage < .1 || minQueryCoverage > 1) { | |
| 551 argumentsGood = FALSE | |
| 552 message(paste('Minimum query coverage', minQueryCoverage, 'is outside of the range 0.1 <= x <= 1')) | |
| 553 } | |
| 554 if (minTargetCoverage < 0.02 || minTargetCoverage > 1) { | |
| 555 argumentsGood = FALSE | |
| 556 message(paste('Minimum target coverage', minTargetCoverage, 'is outside of the range 0.1 <= x <= 1')) | |
| 557 } | |
| 558 if (!argumentsGood){ | |
| 559 message('There is a problem with the arguments') | |
| 560 return() | |
| 561 } | |
| 562 | |
| 563 ## Check for the existence of the output directory, remove if it exists | |
| 564 if (file.exists(outputDirectory)) { | |
| 565 printif(paste('Removing existing output directory', outputDirectory), verbosity > 1) | |
| 566 unlink(outputDirectory, recursive = TRUE) | |
| 567 } | |
| 568 printif(paste('Making output directory', outputDirectory), verbosity > 1) | |
| 569 dir.create(outputDirectory) | |
| 570 | |
| 571 ## Default to c(1) for the plasmid search depth | |
| 572 searchDepths = lapply(searchDepth, function(i){seq(i, 1)}) | |
| 573 searchDepths = as.matrix(expand.grid(searchDepths)) | |
| 574 print(searchDepths) | |
| 575 | |
| 576 plasmidScores = mclapply(1:nrow(searchDepths), function(i) { | |
| 577 findPlasmids(plasmidPSLFile = plasmidPSLFile, plasmidDatabase = plasmidDatabase, | |
| 578 amrPSLFile = amrPSLFile, amrDatabase, noAMR = noAMR, | |
| 579 incPSLFile = incPSLFile, incDatabase, noInc = noInc, | |
| 580 outputDirectory = paste0(outputDirectory, '/plasmids_', paste(searchDepths[i,], collapse = '_')), overwrite = overwrite, | |
| 581 maxTargetLength = 300000, | |
| 582 minQueryLength = 200, | |
| 583 makeCircos = makeCircos, | |
| 584 minQueryCoverage = 1/2, minTargetCoverage = 1/2, | |
| 585 searchDepth = searchDepths[i,], ## Search depth i | |
| 586 verbosity = verbosity) | |
| 587 }, | |
| 588 mc.cores = threads) | |
| 589 plasmidScores = unlist(plasmidScores) | |
| 590 | |
| 591 print(cbind(searchDepths, plasmidScores)) | |
| 592 | |
| 593 ## Pick out the best set, penalizing for not taking the first | |
| 594 penalties = (unlist(apply(searchDepths, 1, sum)) - ncol(searchDepths)) * 2500 | |
| 595 print(penalties) | |
| 596 plasmidScores = plasmidScores - penalties | |
| 597 print(cbind(searchDepths, plasmidScores)) | |
| 598 bestScoring = which.max(plasmidScores) | |
| 599 bestScoringDirectory = paste0(outputDirectory, '/plasmids_', paste(searchDepths[bestScoring,], collapse = '_')) | |
| 600 | |
| 601 ## Link to the best scoring files | |
| 602 files = c('plasmids.tsv', 'amrFinal.bed') | |
| 603 commands = paste('ln -s ', paste0('"', bestScoringDirectory, '/', files, '"'), | |
| 604 paste0(outputDirectory, '/', files)) | |
| 605 printif(commands, verbosity >= 2) | |
| 606 lapply(commands, system) | |
| 607 | |
| 608 } | |
| 609 | |
| 610 | |
| 611 optionList = list( | |
| 612 make_option('--plasmid_psl', type = 'character', default = NULL, | |
| 613 help = 'Plasmid PSL database-v-contig output', metavar = '<PSL_FILE>'), | |
| 614 make_option('--plasmid_database', type = 'character', default = NULL, | |
| 615 help = 'Plasmid database', metavar = '<PLASMID_FASTA>'), | |
| 616 make_option('--amr_database', type = 'character', default = NULL, | |
| 617 help = 'AMR database', metavar = '<AMR_FASTA>'), | |
| 618 make_option('--amr_blast', type = 'character', default = NULL, | |
| 619 help = 'AMR blast output', metavar = '<BLAST_6>'), | |
| 620 make_option('--output', type = 'character', default = NULL, | |
| 621 help = 'Output dir', metavar = '<OUTPUT_DIR>'), | |
| 622 make_option('--threads', type = 'numeric', default = NULL, | |
| 623 help = 'Output dir', metavar = '<OUTPUT_DIR>'), | |
| 624 make_option('--no_amr', action = 'store_true', default = FALSE, | |
| 625 help = 'Don\'t run AMR'), | |
| 626 make_option('--no_inc', action = 'store_true', default = FALSE, | |
| 627 help = 'Don\'t run incompatibility groups') | |
| 628 ) | |
| 629 | |
| 630 optParser = OptionParser(option_list = optionList) | |
| 631 opt = parse_args(optParser) | |
| 632 print(opt) | |
| 633 pChunks(plasmidPSLFile = opt$'plasmid_psl', plasmidDatabase = opt$'plasmid_database', | |
| 634 amrPSLFile = opt$'amr_blast', amrDatabase = opt$'amr_database', | |
| 635 outputDirectory = opt$output, | |
| 636 threads = opt$threads, | |
| 637 # searchDepth = c(1,1), | |
| 638 searchDepth = c(5,5), | |
| 639 noAMR = TRUE, noInc = TRUE, | |
| 640 verbosity = 2) | |
| 641 | |
| 642 | |
| 643 |
