0
|
1 #!/bin/env Rscript
|
|
2
|
|
3 library(parallel)
|
|
4 library(hash)
|
|
5 library(stringr)
|
|
6 library(grid)
|
|
7 library(gridExtra)
|
|
8 library(optparse)
|
|
9
|
|
10 options(width = 180)
|
|
11
|
|
12 printif = function(string = NULL, condition){
|
|
13 if (condition) {
|
|
14 print(string)
|
|
15 }
|
|
16 }
|
|
17
|
|
18 findPlasmids = function(plasmidPSLFile = NULL, plasmidDatabase = NULL,
|
|
19 amrPSLFile = NULL, amrDatabase, noAMR = FALSE,
|
|
20 incPSLFile = NULL, incDatabase, noInc = FALSE,
|
|
21 outputDirectory = NA, overwrite = TRUE,
|
|
22 maxTargetLength = 300000,
|
|
23 minQueryLength = 500,
|
|
24 makeCircos = FALSE,
|
|
25 minQueryCoverage = 1/2, minTargetCoverage = 1/2,
|
|
26 searchDepth = NULL,
|
|
27 verbosity = 0) {
|
|
28
|
|
29 ## Verify the arguments
|
|
30 argumentsGood = TRUE
|
|
31 if (minQueryCoverage < .1 || minQueryCoverage > 1) {
|
|
32 argumentsGood = FALSE
|
|
33 message(paste('Minimum query coverage', minQueryCoverage, 'is outside of the range 0.1 <= x <= 1'))
|
|
34 }
|
|
35 if (minTargetCoverage < 0.02 || minTargetCoverage > 1) {
|
|
36 argumentsGood = FALSE
|
|
37 message(paste('Minimum target coverage', minTargetCoverage, 'is outside of the range 0.1 <= x <= 1'))
|
|
38 }
|
|
39 if (!argumentsGood){
|
|
40 message('There is a problem with the arguments')
|
|
41 return()
|
|
42 }
|
|
43
|
|
44 printif(paste('Finding plasmids in', plasmidPSLFile), verbosity > 0)
|
|
45
|
|
46 ## Keep track of the total score in case we doing a grid search
|
|
47 totalPlasmidScore = 0
|
|
48
|
|
49 ## Check for the existence of the output directory, remove if it exists
|
|
50 if (file.exists(outputDirectory)) {
|
|
51 printif(paste('Removing existing output directory', outputDirectory), verbosity > 1)
|
|
52 unlink(outputDirectory, recursive = TRUE)
|
|
53 }
|
|
54 printif(paste('Making output directory', outputDirectory), verbosity > 1)
|
|
55 dir.create(outputDirectory)
|
|
56 outputPrefix = paste0(outputDirectory, "/plasmids")
|
|
57
|
|
58 ## Read in and filter the plasmid hits
|
|
59 plasmidHits = read.table(plasmidPSLFile, row.names = NULL, header = FALSE, sep = '\t', stringsAsFactors = FALSE, skip = 5)
|
|
60 colnames(plasmidHits) = c('match', 'mismatch', 'rep_m', 'Ns', 'tgap_c', 'tgap_b',
|
|
61 'qgap_c', 'qgap_b', 'strand',
|
|
62 'target', 'tlength', 'tstart', 'tstop',
|
|
63 'query', 'qlength', 'qstart', 'qstop',
|
|
64 'blocks', 'block_sizes', 'tstarts', 'qstarts')
|
|
65 printif(paste("Sequence-plasmid hits:", nrow(plasmidHits)), verbosity > 0)
|
|
66
|
|
67 plasmidHits = plasmidHits[order(plasmidHits[,'target'], -plasmidHits[,'qlength']), ]
|
|
68
|
|
69 ## Toss out any hits missing information
|
|
70 plasmidHits = plasmidHits[complete.cases(plasmidHits),]
|
|
71
|
|
72 ## Toss out very long plasmid sequences -- probably actually genome chunks labeled incorrectly
|
|
73 veryLongHits = sum(plasmidHits[,'tlength'] >= maxTargetLength)
|
|
74 printif(paste('Removing', veryLongHits, 'hits greater than', maxTargetLength), verbosity > 0)
|
|
75 plasmidHits = plasmidHits[plasmidHits[,'tlength'] <= maxTargetLength, ]
|
|
76 printif(paste("Sequence-plasmid hits after removing very long plasmids:", nrow(plasmidHits)), verbosity > 0)
|
|
77
|
|
78 ## Toss out very short query sequences -- probably junk or repeats
|
|
79 veryShortQuery = sum(plasmidHits[,'qlength'] >= minQueryLength)
|
|
80 printif(paste('Removing', veryShortQuery, 'queries less than', minQueryLength), verbosity > 0)
|
|
81 plasmidHits = plasmidHits[plasmidHits[,'qlength'] >= minQueryLength, ]
|
|
82 printif(paste("Sequence-plasmid hits after removing very short queries:", nrow(plasmidHits)), verbosity > 0)
|
|
83
|
|
84 ## Toss out sequece-plasmid pairs below the coverage cutoff
|
|
85 sequenceMatches = aggregate(x = plasmidHits[,'match',drop = FALSE],
|
|
86 by = list(plasmidHits[,'query'], plasmidHits[,'target']), FUN = sum)
|
|
87 printif(head(sequenceMatches), verbosity > 1)
|
|
88 printif(paste('Sequence-plasmid pair matches:', paste(dim(sequenceMatches), collapse = 'x')), verbosity > 1)
|
|
89
|
|
90 sequenceLengths = aggregate(x = plasmidHits[,'qlength', drop = FALSE],
|
|
91 by = list(plasmidHits[,'query'], plasmidHits[,'target']), FUN = max)
|
|
92 printif(head(sequenceLengths), verbosity > 1)
|
|
93 printif(paste('Sequence-plasmid pair lengths:', paste(dim(sequenceLengths), collapse = 'x')), verbosity > 1)
|
|
94
|
|
95 matchingFractions = cbind(sequenceMatches[,c(1,2)], sequenceMatches[,3] / sequenceLengths[,3])
|
|
96 colnames(matchingFractions) = c('query', 'target', 'fraction')
|
|
97 printif(head(matchingFractions), verbosity > 1)
|
|
98 printif(paste('Sequence-plasmid pair fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1)
|
|
99
|
|
100 matchingFractions = matchingFractions[matchingFractions[,'fraction'] >= minQueryCoverage,]
|
|
101 printif(head(matchingFractions), verbosity > 1)
|
|
102 printif(paste('Passing sequence-plasmid pair fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1)
|
|
103
|
|
104 aboveMinCoverage = apply(matchingFractions, 1, function(i){paste0(i['query'], '|', i['target'])})
|
|
105 plasmidHits = plasmidHits[apply(plasmidHits, 1, function(i){paste0(i['query'], '|', i['target'])}) %in% aboveMinCoverage, ]
|
|
106 printif(paste("Sequence-plasmid hits after removing low-coverage hits:", nrow(plasmidHits)), verbosity > 0)
|
|
107
|
|
108 ## Toss out plasmid sequences below the coverage cutoff
|
|
109 targetMatches = aggregate(x = plasmidHits[,'match',drop = FALSE],
|
|
110 by = list(plasmidHits[,'target']), FUN = sum)
|
|
111 printif(head(targetMatches), verbosity > 1)
|
|
112 printif(paste('Plasmid matches:', paste(dim(targetMatches), collapse = 'x')), verbosity > 1)
|
|
113
|
|
114 targetLengths = aggregate(x = plasmidHits[,'tlength', drop = FALSE],
|
|
115 by = list(plasmidHits[,'target']), FUN = max)
|
|
116 printif(head(targetLengths), verbosity > 1)
|
|
117 printif(paste('Plasmid lengths:', paste(dim(targetLengths), collapse = 'x')), verbosity > 1)
|
|
118
|
|
119 matchingFractions = cbind(targetMatches[,1], targetMatches[,2] / targetLengths[,2])
|
|
120 colnames(matchingFractions) = c('target', 'fraction')
|
|
121 printif(head(matchingFractions), verbosity > 1)
|
|
122 printif(paste('Plasmid fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1)
|
|
123
|
|
124 matchingFractions = matchingFractions[matchingFractions[,'fraction'] >= minTargetCoverage,]
|
|
125 printif(head(matchingFractions), verbosity > 1)
|
|
126 printif(paste('Passing plasmid fractions:', paste(dim(matchingFractions), collapse = 'x')), verbosity > 1)
|
|
127
|
|
128 aboveMinCoverage = matchingFractions[, 'target']
|
|
129 plasmidHits = plasmidHits[plasmidHits[, 'target'] %in% aboveMinCoverage, ]
|
|
130 printif(paste("Sequence-plasmid hits after removing low-coverage hits:", nrow(plasmidHits)), verbosity > 0)
|
|
131
|
|
132
|
|
133 ## If we're out of sequece-plasmid hits, then stop here
|
|
134 if (nrow(plasmidHits) == 0) {
|
|
135 message(paste('Not hits found'))
|
|
136 return
|
|
137 }
|
|
138
|
|
139 ## Find out how much of each query (contig) is covered by each target (plasmid).
|
|
140 ## Query coverage is constant and does not change as we assign contigs to plasmids
|
|
141 queryCoverage = hash()
|
|
142 queryMismatches = hash()
|
|
143 for (i in 1:nrow(plasmidHits)) {
|
|
144 if (!(i %% 1000)) {
|
|
145 printif(paste('Processing hit', i, '/', nrow(plasmidHits)), verbosity > 0)
|
|
146 }
|
|
147
|
|
148 query = plasmidHits[i,'query']
|
|
149 target = plasmidHits[i, 'target']
|
|
150
|
|
151 ## Represent each sequence-plasmid hit as a series of 0/1 vectors that
|
|
152 if (!has.key(query, queryCoverage)) {
|
|
153 queryCoverage[[query]] = hash()
|
|
154 queryMismatches[[query]] = hash()
|
|
155 }
|
|
156 if (!has.key(target, queryCoverage[[query]])) {
|
|
157 queryCoverage[[query]][[target]] = rep(0, times = plasmidHits[i, 'qlength'])
|
|
158 queryMismatches[[query]][[target]] = 0
|
|
159 }
|
|
160
|
|
161 blockSizes = as.numeric(unlist(strsplit(x = plasmidHits[i,'block_sizes'], ',')))
|
|
162 qBlockStarts = as.numeric(unlist(strsplit(x = plasmidHits[i,'qstarts'], ',')))
|
|
163
|
|
164 for (j in 1:length(blockSizes)) {
|
|
165 queryCoverage[[query]][[target]][qBlockStarts[j]:(qBlockStarts[j]+blockSizes[j])] = 1
|
|
166 }
|
|
167 queryMismatches[[query]][[target]] = queryMismatches[[query]][[target]] + plasmidHits[i,'mismatch']
|
|
168 }
|
|
169
|
|
170
|
|
171 ## Pull the full plasmid names from the blast database because BLAT/minimap2 doesn't report them, just the ID's
|
|
172 targetIDs = plasmidHits[,'target']
|
|
173 targetIDs = gsub("\\|$", "", targetIDs)
|
|
174 targetIDs = gsub(".*(\\|.*)$", "\\1", targetIDs)
|
|
175 noDotIDs = gsub("\\|", "", targetIDs)
|
|
176 noDotIDs = gsub("(^H[^.]+).[0-9]+$", "\\1", noDotIDs)
|
|
177 noDotIDs = cbind(noDotIDs)
|
|
178
|
|
179 targetFile = paste0(outputDirectory, '/targets.tsv', sep = '')
|
|
180 write.table(file = targetFile, x = noDotIDs, quote = FALSE, row.names = FALSE, col.names = FALSE)
|
|
181 command = paste('blastdbcmd -db', plasmidDatabase,
|
|
182 '-entry_batch', targetFile,
|
|
183 '| grep ">"')
|
|
184 targetNames = system(command, intern = TRUE)
|
|
185 printif(paste('Found', length(targetNames), 'target names for', length(targetIDs), 'targets.'), verbosity > 0)
|
|
186
|
|
187 targetNames = gsub('^>.*\\| ', '', targetNames)
|
|
188 targetNames = gsub('^>[^ ]+', '', targetNames)
|
|
189 plasmidHits = cbind(plasmidHits, targetIDs, targetNames)
|
|
190 printif(paste('Named hits:', nrow(plasmidHits)), verbosity > 1)
|
|
191
|
|
192 #Pull just the plasmids out of the larget set of hits, i.e, make sure it has the word 'plasmid' in the description.
|
|
193 plasmidHits = plasmidHits[grep('plasmid|vector', plasmidHits[,'targetNames'], ignore.case = TRUE), ,drop = FALSE]
|
|
194 plasmidHits = plasmidHits[!grepl('tig0000|unnamed', plasmidHits[,'targetNames'], ignore.case = TRUE), , drop = FALSE]
|
|
195 printif(paste("Sequece-plasmid hits after screening by name:", paste(dim(plasmidHits), collapse = 'x')), verbosity > 1)
|
|
196
|
|
197 ## Stop if there is nothing left
|
|
198 if (is.null(plasmidHits)) {
|
|
199 message('Not hits found')
|
|
200 return()
|
|
201 }
|
|
202 if (nrow(plasmidHits) == 0) {
|
|
203 message('Not hits found')
|
|
204 return()
|
|
205 }
|
|
206
|
|
207 ## Clean up the plasmid names -- they look like crap by default.
|
|
208 plasmidNames = plasmidHits[,'targetNames']
|
|
209 plasmidNames = gsub(', comp.*', '', plasmidNames)
|
|
210 plasmidNames = gsub(', contig.*', '', plasmidNames)
|
|
211 plasmidNames = gsub(', partial.*', '', plasmidNames)
|
|
212 plasmidNames = gsub('strain ', '', plasmidNames)
|
|
213 plasmidNames = gsub('^ *', '', plasmidNames)
|
|
214 plasmidNames = sub('^(cl\\|)(.*?) ', '', plasmidNames)
|
|
215 plasmidNames = sub('subsp. (.*?) ', '', plasmidNames)
|
|
216 plasmidNames = sub('serovar (.*?) ', '', plasmidNames)
|
|
217 plasmidNames = sub('strain ', '', plasmidNames)
|
|
218 plasmidNames = sub('plasmid$', '', plasmidNames)
|
|
219 plasmidHits[,'targetNames'] = plasmidNames
|
|
220
|
|
221 ## Just take the best hit for each plasmid, hence the head, 1 in the agg
|
|
222 plasmidNames = aggregate(plasmidHits, by = list(plasmidHits[,'query']), FUN = head, 1)
|
|
223 plasmidNames = plasmidNames[, 'targetNames', drop = FALSE]
|
|
224
|
|
225 ## Find the set of plasmid coverage hits for each itteration
|
|
226 usedContigs = c()
|
|
227 plasmidMismatches = c()
|
|
228
|
|
229 ## Order hits by the plasmid ID and the query length
|
|
230 plasmidHits = plasmidHits[order(plasmidHits[,'target'], -plasmidHits[,'qlength']), ]
|
|
231
|
|
232 ## Iterate, finding plasmids until we run out of usable sequence-plasmid its
|
|
233 plasmidNumber = 0
|
|
234 plasmidResults = c()
|
|
235 while (1) {
|
|
236
|
|
237 ## Keep track of how many plasmids we have gone over
|
|
238 plasmidNumber = plasmidNumber + 1
|
|
239
|
|
240 printif(paste('Sequence-plasmid hits left:', nrow(plasmidHits)), verbosity > 1)
|
|
241 contigToPlasmid = hash()
|
|
242 plasmidToContig = hash()
|
|
243 plasmidCoverage = hash()
|
|
244 plasmidCoverageWithRepeats = hash()
|
|
245 contigCoverage = hash()
|
|
246
|
|
247 ##Find contig/plasmid plasmid/contig pairs
|
|
248 if (is.null(plasmidHits)) {
|
|
249 break
|
|
250 }
|
|
251 if (nrow(plasmidHits) == 0) {
|
|
252 break
|
|
253 }
|
|
254 repLengths = c()
|
|
255
|
|
256
|
|
257 ## Find the coverage of each plasmid in the possible set by the contigs
|
|
258 for (i in 1:nrow(plasmidHits)) {
|
|
259
|
|
260 query = plasmidHits[i,'query']
|
|
261 target = plasmidHits[i,'target']
|
|
262 matches = plasmidHits[i,'match']
|
|
263 mismatches = plasmidHits[i,'mismatch']
|
|
264 score = matches - mismatches
|
|
265 queryLength = plasmidHits[i,'qlength']
|
|
266 blockSizes = as.numeric(unlist(strsplit(plasmidHits[i, 'block_sizes'], ',')))
|
|
267 queryStarts = as.numeric(unlist(strsplit(plasmidHits[i, 'qstarts'], ',')))
|
|
268 targetStarts = as.numeric(unlist(strsplit(plasmidHits[i, 'tstarts'], ',')))
|
|
269
|
|
270 ## Skip matches which have less than 50% of the bases from the contig on the plasmid -- probably not a good match
|
|
271 if ((sum(queryCoverage[[query]][[target]]) - queryMismatches[[query]][[target]]) / queryLength <= minQueryCoverage) {
|
|
272 next
|
|
273 }
|
|
274
|
|
275 targetLength = plasmidHits[i, 'tlength']; targetStart = plasmidHits[i, 'tstart']; targetStop = plasmidHits[i, 'tstop']
|
|
276
|
|
277 ## Relate this contig to this plasmid
|
|
278 if (!has.key(query, contigToPlasmid)) {
|
|
279 contigToPlasmid[[query]] = hash()
|
|
280 contigCoverage[[query]] = hash()
|
|
281 }
|
|
282 if (!has.key(target, contigToPlasmid[[query]])) {
|
|
283 contigToPlasmid[[query]][[target]] = score
|
|
284 contigCoverage[[query]][[target]] = rep(0, queryLength)
|
|
285 } else {
|
|
286 contigToPlasmid[[query]][[target]] = contigToPlasmid[[query]][[target]] + score
|
|
287 }
|
|
288
|
|
289 ## Keep track of target(plasmid) coverage by the contigs
|
|
290 if (!has.key(target, plasmidCoverage)) {
|
|
291 plasmidCoverage[[target]] = rep(0, targetLength)
|
|
292 plasmidCoverageWithRepeats[[target]] = rep(0, targetLength)
|
|
293 plasmidToContig[[target]] = hash()
|
|
294 plasmidMismatches[target] = 0
|
|
295 }
|
|
296
|
|
297 penalized = FALSE
|
|
298 for (j in 1:length(blockSizes)) {
|
|
299
|
|
300 ## Keep track of all contig alignments to this plasmid, even with repeats
|
|
301 plasmidCoverageWithRepeats[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])] = 1
|
|
302
|
|
303 ## Skip if this region of the query sequence has already been assigned to this plasmid
|
|
304 if (sum(contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0) <= 50) {
|
|
305 printif(paste('Sequence', query, 'already used for', target,
|
|
306 '. ', paste0(queryStarts[j], '-', queryStarts[j] + blockSizes[j])), verbosity > 2)
|
|
307 next
|
|
308 }
|
|
309 if (!penalized) {
|
|
310 ## Penalty for every gap, only penalize once per match
|
|
311 ## TODO: Penalize for gap length, not just once per gap
|
|
312 #plasmidMismatches[target] = plasmidMismatches[target] + (length(blockSizes) - 1) * 100
|
|
313 plasmidMismatches[target] = plasmidMismatches[target] + mismatches * 5
|
|
314 penalized = TRUE
|
|
315 }
|
|
316
|
|
317 plasmidCoverage[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])][contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0] =
|
|
318 plasmidCoverage[[target]][targetStarts[j]:(targetStarts[j] + blockSizes[j])][contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] == 0] + 1
|
|
319 contigCoverage[[query]][[target]][queryStarts[j]:(queryStarts[j] + blockSizes[j])] = 1
|
|
320 }
|
|
321
|
|
322 ## Relate this plasmid to this contig
|
|
323 if (!has.key(query, plasmidToContig[[target]])) {
|
|
324 plasmidToContig[[target]][[query]] = score
|
|
325 } else {
|
|
326 plasmidToContig[[target]][[query]] = plasmidToContig[[target]][[query]] + score
|
|
327 }
|
|
328 if (target == 'NZ_GG692894.1' && query == 'contig_3_0') {
|
|
329 print('NZ_GG692894.1')
|
|
330 print(sum(plasmidCoverage[[target]]))
|
|
331 print(sum(contigCoverage[[query]][[target]]))
|
|
332 }
|
|
333
|
|
334 }
|
|
335
|
|
336 ## Get the best set of plasmids out, i.e., the set with the most bases matching between the contig and plasmid
|
|
337 plasmidScores = c()
|
|
338 for (thisPlasmid in keys(plasmidCoverage)){
|
|
339 thisPlasmidScore = sum(plasmidCoverage[[thisPlasmid]])
|
|
340 plasmidScores = c(plasmidScores, thisPlasmidScore)
|
|
341 names(plasmidScores)[length(plasmidScores)] = thisPlasmid
|
|
342 }
|
|
343 plasmidScores = sort(plasmidScores - plasmidMismatches[names(plasmidScores)], dec = TRUE)
|
|
344
|
|
345 if (length(plasmidScores) > 0) {
|
|
346 printif('Highest scoring plasmids', verbosity > 1)
|
|
347 printif(head(cbind(plasmidScores), 20), verbosity > 1)
|
|
348 }
|
|
349
|
|
350 ## Stop searching for plasmids if nothing matches well or we're out of hits
|
|
351 if (length(plasmidScores) == 0) {
|
|
352 printif('Out of plasmids', verbosity > 0)
|
|
353 break
|
|
354 } else if (max(plasmidScores) < 500) {
|
|
355 printif('Out of min-scoring plasmids', verbosity > 0)
|
|
356 break
|
|
357 }
|
|
358
|
|
359 ## For each matching plasmid, ordered by total bases matching the assembly, find the set of corresponding contigs
|
|
360 plasmidToUse = 1
|
|
361 if (!is.null(searchDepth) && plasmidNumber <= length(searchDepth)) {
|
|
362 plasmidToUse = searchDepth[plasmidNumber]
|
|
363 }
|
|
364 if (plasmidToUse > length(plasmidScores)) {
|
|
365 plasmidToUse = length(plasmidScores)
|
|
366 }
|
|
367
|
|
368 plasmid = names(plasmidScores)[plasmidToUse]
|
|
369 print(paste('Plasmid picked', plasmid))
|
|
370 totalPlasmidScore = totalPlasmidScore + plasmidScores[plasmid]
|
|
371 printif(paste("Pulling sequences for", plasmid), verbosity > 0)
|
|
372
|
|
373 ## Find contigs what haven't already been given to another plasmid so we can assign them next round
|
|
374 plasmidContigs = keys(plasmidToContig[[plasmid]])
|
|
375 unusedContigs = plasmidContigs[!(plasmidContigs %in% usedContigs)]
|
|
376
|
|
377 ## If no unused contigs that also map to this plasmid then skip it
|
|
378 if (length(unusedContigs) == 0) {
|
|
379 printif(paste("No unused sequences for", plasmid), verbosity > 1)
|
|
380 next
|
|
381 }
|
|
382
|
|
383 ## Keep just the rows for this plasmid and which haven't already been used by another plasmid
|
|
384 plasmidRows = plasmidHits[plasmidHits[,'target'] == plasmid,]
|
|
385 plasmidRows = plasmidRows[plasmidRows[,'query'] %in% unusedContigs,,drop = FALSE]
|
|
386 plasmidName = plasmidRows[1, 'targetNames']
|
|
387 plasmidID = plasmidRows[1, 'targetIDs']
|
|
388 ## Get the plasmid length
|
|
389 command = paste('blastdbcmd -db', plasmidDatabase,
|
|
390 '-entry', plasmidID,
|
|
391 '-outfmt "%l"')
|
|
392 plasmidLength = system(command, intern = TRUE)
|
|
393 plasmidLength = rep(plasmidLength, nrow(plasmidRows))
|
|
394
|
|
395 ## How many bases from the plasmid are uncovered?
|
|
396 plasmidMissing = rep(sum(plasmidCoverageWithRepeats[[as.character(plasmidID)]] == 0), nrow(plasmidRows))
|
|
397
|
|
398 ## Keep track of all of the contigs included
|
|
399 usedContigs = c(usedContigs, unusedContigs)
|
|
400
|
|
401 ## How many matching bases for each query sequence?
|
|
402 thisPlasmidQuerySizes = c()
|
|
403 thisPlasmidMatches = c()
|
|
404 for (contig in unusedContigs) {
|
|
405 thisPlasmidQuerySizes = c(thisPlasmidQuerySizes, length(queryCoverage[[contig]][[plasmid]]))
|
|
406 thisPlasmidMatches = c(thisPlasmidMatches, sum(queryCoverage[[contig]][[plasmid]]))
|
|
407 }
|
|
408
|
|
409 ## Add this plasmid's hits onto the growing list of sequence-plasmid hits
|
|
410 thisPlasmidResults = cbind(unusedContigs, plasmidName, as.character(plasmidID), thisPlasmidQuerySizes, thisPlasmidMatches, plasmidLength, plasmidMissing)
|
|
411 colnames(thisPlasmidResults) = c('query.name', 'plasmid.name', 'plasmid.accession', 'query.size', 'aligned.bases', 'plasmid.size', 'plasmid.missing')
|
|
412 thisPlasmidResults = thisPlasmidResults[order(-thisPlasmidMatches), ]
|
|
413 plasmidResults = rbind(plasmidResults, thisPlasmidResults)
|
|
414
|
|
415 ## Remove the contigs added to this plasmid from the list of plasmid/contig BLAT hits
|
|
416 plasmidHits = plasmidHits[!(plasmidHits[,'query'] %in% usedContigs),]
|
|
417 plasmidHits = plasmidHits[!(plasmidHits[,'target'] == plasmid),]
|
|
418 }
|
|
419
|
|
420 rownames(plasmidResults) = plasmidResults[,1]
|
|
421
|
|
422 ## Check for the presence of AMR genes in this file
|
|
423 if (!noAMR) {
|
|
424 amrBEDFile = paste0(outputDirectory, '/amrMapping.bed')
|
|
425 command = paste('cat', amrPSLFile,
|
|
426 '| awk -F \'\\t\' \'($3 >= 80) && ($4 / $14 >= .95){OFS = "\t"; print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? "+" : "-")}\'',
|
|
427 '| sort -k 1,1 -k 2,2n >', amrBEDFile)
|
|
428 printif(command, verbosity > 1)
|
|
429 system(command)
|
|
430 ## Find local overlapping regions
|
|
431 amrMergedBEDFile = paste0(outputDirectory, '/amrMergedMapping.bed')
|
|
432 command = paste('bedtools merge -d -30 -i', amrBEDFile, '>', amrMergedBEDFile)
|
|
433 printif(command, verbosity > 1)
|
|
434 system(command)
|
|
435 ## Find the best AMR gene for each region
|
|
436 amrFinalBEDFile = paste0(outputDirectory, '/amrFinal.bed')
|
|
437 command = paste('bedtools intersect',
|
|
438 '-a', amrBEDFile,
|
|
439 '-b', amrMergedBEDFile,
|
|
440 '-f .9 -F .9',
|
|
441 '-wao',
|
|
442 '| awk \'$7 != "."\'',
|
|
443 '| awk \'{OFS="\t";locus=$7"\t"$8"\t"$9; if($5 > s[locus]){s[locus]=$5;b[locus] = $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}}END{for(i in b){print i,b[i]}}\'',
|
|
444 '>', amrFinalBEDFile)
|
|
445 printif(command, verbosity > 1)
|
|
446 system(command)
|
|
447
|
|
448 ## Read the AMR results in and add them to the plasmid contigs
|
|
449 amrResults = read.table(file = amrFinalBEDFile, header = FALSE, row.names = NULL, stringsAsFactors = FALSE, quote = '')
|
|
450 amrResults[,7] = gsub('(_.*$)|(.*\\|)', '', amrResults[,7])
|
|
451 amrResults = aggregate(amrResults[ , 7, drop = FALSE], by = list(amrResults[,1]), function(i){paste(i, collapse = ', ')})
|
|
452 rownames(amrResults) = amrResults[,1]
|
|
453 amrResults = amrResults[ , 2, drop = FALSE]
|
|
454 print(amrResults)
|
|
455
|
|
456 plasmidResults = cbind(plasmidResults, rep('', nrow(plasmidResults)))
|
|
457 colnames(plasmidResults)[ncol(plasmidResults)] = 'amr'
|
|
458 plasmidResults[rownames(plasmidResults) %in% rownames(amrResults), 'amr'] =
|
|
459 amrResults[rownames(plasmidResults)[rownames(plasmidResults) %in% rownames(amrResults)], 1]
|
|
460
|
|
461 }
|
|
462
|
|
463 ## Check for the presence of incompatibility groups in this file
|
|
464 if (!noInc) {
|
|
465 incBEDFile = paste0(outputDirectory, '/incMapping.bed')
|
|
466 command = paste('cat', incPSLFile,
|
|
467 '| awk -F \'\\t\' \'($3 >= 80) && ($4 / $14 >= .95){OFS = "\t"; print $2,($9 < $10 ? $9 : $10),($9 < $10 ? $10 : $9),$1,$3/100,($9 < $10 ? "+" : "-")}\'',
|
|
468 '| sort -k 1,1 -k 2,2n >', incBEDFile)
|
|
469 printif(command, verbosity > 1)
|
|
470 system(command)
|
|
471 ## Find local overlapping regions
|
|
472 incMergedBEDFile = paste0(outputDirectory, '/incMergedMapping.bed')
|
|
473 command = paste('bedtools merge -d -30 -i', incBEDFile, '>', incMergedBEDFile)
|
|
474 printif(command, verbosity > 1)
|
|
475 system(command)
|
|
476 ## Find the best INC group for each region
|
|
477 incFinalBEDFile = paste0(outputDirectory, '/incFinal.bed')
|
|
478 command = paste('bedtools intersect',
|
|
479 '-a', incBEDFile,
|
|
480 '-b', incMergedBEDFile,
|
|
481 '-f .9 -F .9',
|
|
482 '-wao',
|
|
483 '| awk \'$7 != "."\'',
|
|
484 '| awk \'{OFS="\t";locus=$7"\t"$8"\t"$9; if($5 > s[locus]){s[locus]=$5;b[locus] = $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6}}END{for(i in b){print i,b[i]}}\'',
|
|
485 '>', incFinalBEDFile)
|
|
486 printif(command, verbosity > 1)
|
|
487 system(command)
|
|
488
|
|
489 ## Read the inc group results in and add them to the plasmid contigs
|
|
490 incResults = read.table(file = incFinalBEDFile, header = FALSE, row.names = NULL, stringsAsFactors = FALSE, quote = '')
|
|
491 incResults[,7] = gsub('(_.*$)|(.*\\|)', '', incResults[,7])
|
|
492 incResults = aggregate(incResults[ , 7, drop = FALSE], by = list(incResults[,1]), function(i){paste(i, collapse = ', ')})
|
|
493 rownames(incResults) = incResults[,1]
|
|
494 incResults = incResults[ , 2, drop = FALSE]
|
|
495 print(incResults)
|
|
496
|
|
497 plasmidResults = cbind(plasmidResults, rep('', nrow(plasmidResults)))
|
|
498 colnames(plasmidResults)[ncol(plasmidResults)] = 'inc'
|
|
499 plasmidResults[rownames(plasmidResults) %in% rownames(incResults), 'inc'] =
|
|
500 incResults[rownames(plasmidResults)[rownames(plasmidResults) %in% rownames(incResults)], 1]
|
|
501 }
|
|
502
|
|
503 ## Write the plasmid results to file
|
|
504 plasmidChunksFile = paste0(outputDirectory, '/plasmids.tsv')
|
|
505 write.table(file = plasmidChunksFile, x = plasmidResults, quote = FALSE, sep = '\t', row.names = FALSE, col.names = TRUE)
|
|
506
|
|
507 ## Dump a sequence file of potential plasmid contigs
|
|
508 plasmidSequenceFile = paste0(outputPrefix, '.fna')
|
|
509 system(paste0('echo "" >', plasmidSequenceFile))
|
|
510 for (contig in plasmidResults[,'query.name']) {
|
|
511 command = paste('faidx', paste0('assembly.fasta'), contig, '>>', plasmidSequenceFile)
|
|
512 print(command)
|
|
513 #system(command)
|
|
514 }
|
|
515
|
|
516
|
|
517 ## Return the total score of this round, in case we are doing a search
|
|
518 return(totalPlasmidScore)
|
|
519
|
|
520 }
|
|
521
|
|
522 pChunks = function(plasmidPSLFile = NULL, plasmidDatabase = NULL,
|
|
523 amrPSLFile = NULL, amrDatabase, noAMR = FALSE,
|
|
524 incPSLFile = NULL, incDatabase, noInc = FALSE,
|
|
525 outputDirectory = NA, overwrite = TRUE,
|
|
526 maxTargetLength = 300000,
|
|
527 minQueryLength = 200,
|
|
528 makeCircos = FALSE,
|
|
529 minQueryCoverage = 1/2, minTargetCoverage = 1/2,
|
|
530 searchDepth = c(1),
|
|
531 threads = 1,
|
|
532 verbosity = 2) {
|
|
533
|
|
534 print(plasmidDatabase)
|
|
535
|
|
536 ## Verify the arguments
|
|
537 argumentsGood = TRUE
|
|
538 if (!file.exists(plasmidPSLFile)) {
|
|
539 argumentsGood = FALSE
|
|
540 message(paste('Plasmid PSL file', plasmidPSLFile, 'not found'))
|
|
541 }
|
|
542 if (is.na(outputDirectory)) {
|
|
543 argumentsGood = FALSE
|
|
544 message('Output directory not given')
|
|
545 }
|
|
546 if (file.exists(outputDirectory) && !overwrite) {
|
|
547 argumentsGood = FALSE
|
|
548 message(paste('Output directory', outputDirectory, 'already exists. Add overwrite = TRUE'))
|
|
549 }
|
|
550 if (minQueryCoverage < .1 || minQueryCoverage > 1) {
|
|
551 argumentsGood = FALSE
|
|
552 message(paste('Minimum query coverage', minQueryCoverage, 'is outside of the range 0.1 <= x <= 1'))
|
|
553 }
|
|
554 if (minTargetCoverage < 0.02 || minTargetCoverage > 1) {
|
|
555 argumentsGood = FALSE
|
|
556 message(paste('Minimum target coverage', minTargetCoverage, 'is outside of the range 0.1 <= x <= 1'))
|
|
557 }
|
|
558 if (!argumentsGood){
|
|
559 message('There is a problem with the arguments')
|
|
560 return()
|
|
561 }
|
|
562
|
|
563 ## Check for the existence of the output directory, remove if it exists
|
|
564 if (file.exists(outputDirectory)) {
|
|
565 printif(paste('Removing existing output directory', outputDirectory), verbosity > 1)
|
|
566 unlink(outputDirectory, recursive = TRUE)
|
|
567 }
|
|
568 printif(paste('Making output directory', outputDirectory), verbosity > 1)
|
|
569 dir.create(outputDirectory)
|
|
570
|
|
571 ## Default to c(1) for the plasmid search depth
|
|
572 searchDepths = lapply(searchDepth, function(i){seq(i, 1)})
|
|
573 searchDepths = as.matrix(expand.grid(searchDepths))
|
|
574 print(searchDepths)
|
|
575
|
|
576 plasmidScores = mclapply(1:nrow(searchDepths), function(i) {
|
|
577 findPlasmids(plasmidPSLFile = plasmidPSLFile, plasmidDatabase = plasmidDatabase,
|
|
578 amrPSLFile = amrPSLFile, amrDatabase, noAMR = noAMR,
|
|
579 incPSLFile = incPSLFile, incDatabase, noInc = noInc,
|
|
580 outputDirectory = paste0(outputDirectory, '/plasmids_', paste(searchDepths[i,], collapse = '_')), overwrite = overwrite,
|
|
581 maxTargetLength = 300000,
|
|
582 minQueryLength = 200,
|
|
583 makeCircos = makeCircos,
|
|
584 minQueryCoverage = 1/2, minTargetCoverage = 1/2,
|
|
585 searchDepth = searchDepths[i,], ## Search depth i
|
|
586 verbosity = verbosity)
|
|
587 },
|
|
588 mc.cores = threads)
|
|
589 plasmidScores = unlist(plasmidScores)
|
|
590
|
|
591 print(cbind(searchDepths, plasmidScores))
|
|
592
|
|
593 ## Pick out the best set, penalizing for not taking the first
|
|
594 penalties = (unlist(apply(searchDepths, 1, sum)) - ncol(searchDepths)) * 2500
|
|
595 print(penalties)
|
|
596 plasmidScores = plasmidScores - penalties
|
|
597 print(cbind(searchDepths, plasmidScores))
|
|
598 bestScoring = which.max(plasmidScores)
|
|
599 bestScoringDirectory = paste0(outputDirectory, '/plasmids_', paste(searchDepths[bestScoring,], collapse = '_'))
|
|
600
|
|
601 ## Link to the best scoring files
|
|
602 files = c('plasmids.tsv', 'amrFinal.bed')
|
|
603 commands = paste('ln -s ', paste0('"', bestScoringDirectory, '/', files, '"'),
|
|
604 paste0(outputDirectory, '/', files))
|
|
605 printif(commands, verbosity >= 2)
|
|
606 lapply(commands, system)
|
|
607
|
|
608 }
|
|
609
|
|
610
|
|
611 optionList = list(
|
|
612 make_option('--plasmid_psl', type = 'character', default = NULL,
|
|
613 help = 'Plasmid PSL database-v-contig output', metavar = '<PSL_FILE>'),
|
|
614 make_option('--plasmid_database', type = 'character', default = NULL,
|
|
615 help = 'Plasmid database', metavar = '<PLASMID_FASTA>'),
|
|
616 make_option('--amr_database', type = 'character', default = NULL,
|
|
617 help = 'AMR database', metavar = '<AMR_FASTA>'),
|
|
618 make_option('--amr_blast', type = 'character', default = NULL,
|
|
619 help = 'AMR blast output', metavar = '<BLAST_6>'),
|
|
620 make_option('--output', type = 'character', default = NULL,
|
|
621 help = 'Output dir', metavar = '<OUTPUT_DIR>'),
|
|
622 make_option('--threads', type = 'numeric', default = NULL,
|
|
623 help = 'Output dir', metavar = '<OUTPUT_DIR>'),
|
|
624 make_option('--no_amr', action = 'store_true', default = FALSE,
|
|
625 help = 'Don\'t run AMR'),
|
|
626 make_option('--no_inc', action = 'store_true', default = FALSE,
|
|
627 help = 'Don\'t run incompatibility groups')
|
|
628 )
|
|
629
|
|
630 optParser = OptionParser(option_list = optionList)
|
|
631 opt = parse_args(optParser)
|
|
632 print(opt)
|
|
633 pChunks(plasmidPSLFile = opt$'plasmid_psl', plasmidDatabase = opt$'plasmid_database',
|
|
634 amrPSLFile = opt$'amr_blast', amrDatabase = opt$'amr_database',
|
|
635 outputDirectory = opt$output,
|
|
636 threads = opt$threads,
|
|
637 # searchDepth = c(1,1),
|
|
638 searchDepth = c(5,5),
|
|
639 noAMR = TRUE, noInc = TRUE,
|
|
640 verbosity = 2)
|
|
641
|
|
642
|
|
643
|