Mercurial > repos > mmonot > phageterm
comparison PhageTerm.py @ 24:c8f88ae512f3 draft default tip
Uploaded
author | mmonot |
---|---|
date | Tue, 17 Sep 2024 13:35:16 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
23:26ceb3225190 | 24:c8f88ae512f3 |
---|---|
1 #! /usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # This file is a part of PhageTerm software | |
5 # A tool to determine phage termini and packaging strategy | |
6 # and other useful information using raw sequencing reads. | |
7 # (This program works with sequencing reads from randomly | |
8 # sheared DNA library preparations such as Illumina TruSeq paired-end or similar) | |
9 # | |
10 # ---------------------------------------------------------------------- | |
11 # Copyright (C) 2017 Julian Garneau | |
12 # | |
13 # This program is free software; you can redistribute it and/or modify | |
14 # it under the terms of the GNU General Public License as published by | |
15 # the Free Software Foundation; either version 3 of the License, or | |
16 # (at your option) any later version. | |
17 # <http://www.gnu.org/licenses/gpl-3.0.html> | |
18 # | |
19 # This program is distributed in the hope that it will be useful, | |
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 # GNU General Public License for more details. | |
23 # ---------------------------------------------------------------------- | |
24 # | |
25 # @author Julian Garneau <julian.garneau@usherbrooke.ca> | |
26 # @author Marc Monot <marc.monot@pasteur.fr> | |
27 # @author David Bikard <david.bikard@pasteur.fr> | |
28 | |
29 | |
30 ### PYTHON Module | |
31 # Base | |
32 import os | |
33 import sys | |
34 from optparse import OptionParser, OptionGroup | |
35 | |
36 # Multiprocessing | |
37 import multiprocessing | |
38 from multiprocessing import Manager | |
39 import numpy as np | |
40 | |
41 # Project | |
42 from _modules.functions_PhageTerm import * | |
43 | |
44 ### MAIN | |
45 # Option | |
46 usage = """\n\nUsage: %prog -f reads.fastq -r phage_sequence.fasta [-n phage_name -p reads_paired -s seed_lenght -d surrounding -t installation_test -c nbr_core -g host.fasta (warning increase process time)] | |
47 | |
48 Program: PhageTerm - Analyze phage termini and packaging mode using reads from high-throughput sequenced phage data | |
49 Version: 1.0.11 | |
50 Contact: Julian Garneau <julian.garneau@usherbrooke.ca> | |
51 Contact: David Bikard <david.bikard@pasteur.fr> | |
52 Contact: Marc Monot <marc.monot@pasteur.fr> | |
53 | |
54 You can perform a program test run upon installation using the "-t " option. | |
55 Arguments for the -t option can be : 5, 3, DS, DL, M or H. | |
56 | |
57 Example of test commands : | |
58 PhageTerm.py.py -t C5 -> Test run for a 5\' cohesive end (e.g. Lambda) | |
59 PhageTerm.py.py -t C3 -> Test run for a 3\' cohesive end (e.g. HK97) | |
60 PhageTerm.py.py -t DS -> Test run for a Direct Terminal Repeats end short (e.g. T7) | |
61 PhageTerm.py.py -t DL -> Test run for a Direct Terminal Repeats end long (e.g. T5) | |
62 PhageTerm.py.py -t H -> Test run for a Headful packaging (e.g. P1) | |
63 PhageTerm.py.py -t M -> Test run for a Mu-like packaging (e.g. Mu) | |
64 """ | |
65 | |
66 getopt = OptionParser(usage=usage) | |
67 | |
68 optreads = OptionGroup(getopt, 'Raw reads file in fastq format') | |
69 optreads.add_option('-f', '--fastq', dest='fastq', metavar='FILE', help='Fastq reads from Illumina TruSeq') | |
70 getopt.add_option_group(optreads) | |
71 | |
72 optref = OptionGroup(getopt, 'Phage genome in fasta format') | |
73 optref.add_option('-r', '--ref', dest='reference', metavar='FILE', help='Reference phage genome as unique contig in fasta format') | |
74 getopt.add_option_group(optref) | |
75 | |
76 optname = OptionGroup(getopt, 'Name of the phage being analyzed by the user') | |
77 optname.add_option('-n', '--phagename', dest='phagename', metavar='STRING', help='Manually enter the name of the phage being analyzed. Used as prefix for output files.') | |
78 getopt.add_option_group(optname) | |
79 | |
80 optseed = OptionGroup(getopt, 'Lenght of the seed used for reads in the mapping process') | |
81 optseed.add_option('-s', '--seed', dest='seed', metavar='INT', type="int", help='Manually enter the lenght of the seed used for reads in the mapping process.') | |
82 getopt.add_option_group(optseed) | |
83 | |
84 optsurround = OptionGroup(getopt, 'Lenght of the surrounding region considered for peak value cumulation') | |
85 optsurround.add_option('-d', '--surrounding', dest='surround', type="int", metavar='INT', help='Manually enter the lenght of the surrounding used to merge very close peaks in the analysis process.') | |
86 getopt.add_option_group(optsurround) | |
87 | |
88 optcore = OptionGroup(getopt, 'Number of core processors to use (Default: 1)') | |
89 optcore.add_option('-c', '--core', dest='core', metavar='INT', type="int", help='Manually enter the number of core you want to use.') | |
90 getopt.add_option_group(optcore) | |
91 | |
92 opthost = OptionGroup(getopt, 'Host genome in fasta format') | |
93 opthost.add_option('-g', '--host', dest='host', metavar='FILE', help='Reference host genome as unique contig in fasta format') | |
94 getopt.add_option_group(opthost) | |
95 | |
96 optpaired = OptionGroup(getopt, 'Use paired-end reads') | |
97 optpaired.add_option('-p', '--paired', dest='paired', metavar='FILE', help='Use paired-end reads to calculate real insert coverage') | |
98 getopt.add_option_group(optpaired) | |
99 | |
100 optmean = OptionGroup(getopt, 'Defined phage mean coverage') | |
101 optmean.add_option('-m', '--mean', dest='mean', metavar='INT', type="int", help='Defined phage mean coverage') | |
102 getopt.add_option_group(optmean) | |
103 | |
104 opttest = OptionGroup(getopt, 'Perform a program test run upon installation') | |
105 opttest.add_option('-t', '--test', dest='test', metavar='STRING', help='Perform a program test run upon installation. If you want to perform a test run, use the "-t " option. Arguments for the -t option can be : C5, C3, DS, DL, H or M. C5 -> Test run for a 5\' cohesive end (e.g. Lambda); C3 -> Test run for a 3\' cohesive end (e.g. HK97); DS -> Test run for a short Direct Terminal Repeats end (e.g. T7); DL -> Test run for a long Direct Terminal Repeats end (e.g. T5); H -> Test run for a Headful packaging (e.g. P1); M -> Test run for a Mu-like packaging (e.g. Mu)') | |
106 getopt.add_option_group(opttest) | |
107 | |
108 | |
109 ###### | |
110 | |
111 options, arguments = getopt.parse_args() | |
112 fastq = options.fastq | |
113 reference = options.reference | |
114 phagename = options.phagename | |
115 seed = options.seed | |
116 surrounding = options.surround | |
117 core = options.core | |
118 host = options.host | |
119 paired = options.paired | |
120 mean = options.mean | |
121 test = options.test | |
122 | |
123 ###### | |
124 | |
125 if options.fastq == None and options.test == None: | |
126 getopt.error('\tNo reads file provided.\n\t\t\tUse -h or --help for more details\n') | |
127 | |
128 if options.reference == None and options.test == None: | |
129 getopt.error('\tNo fasta reference file provided.\n\t\t\tUse -h or --help for more details\n') | |
130 | |
131 if options.phagename == None and options.test == None: | |
132 phagename = "Phagename" | |
133 | |
134 if options.seed == None: | |
135 seed = 20 | |
136 | |
137 if options.surround == None: | |
138 surrounding = 20 | |
139 | |
140 if options.core == None: | |
141 core = 1 | |
142 | |
143 if options.host == None: | |
144 host = "" | |
145 | |
146 if options.paired == None: | |
147 paired = "" | |
148 | |
149 if options.mean == None: | |
150 mean = 250 | |
151 | |
152 ###### | |
153 | |
154 if options.test == None: | |
155 test_run = 0 | |
156 else: | |
157 test_run = 1 | |
158 | |
159 | |
160 if options.test == "C5": | |
161 print "\nPerforming a test run using test phage sequence with 5 prime cohesive overhang :" | |
162 print "\npython PhageTerm.py -f test-data/COS-5.fastq -r test-data/COS-5.fasta -n TEST_cohesive_5_prime" | |
163 fastq = "test-data/COS-5.fastq" | |
164 reference = "test-data/COS-5.fasta" | |
165 phagename = "Test-cohesive-5'" | |
166 | |
167 | |
168 elif options.test == "C3": | |
169 print "\nPerforming a test run using test phage sequence with 3 prime cohesive overhang:" | |
170 print "\npython PhageTerm.py -f test-data/COS-3.fastq -r test-data/COS-3.fasta -n TEST_cohesive_3_prime" | |
171 fastq = "test-data/COS-3.fastq" | |
172 reference = "test-data/COS-3.fasta" | |
173 phagename = "Test-cohesive-3'" | |
174 | |
175 elif options.test == "DS": | |
176 print "\nPerforming a test run using test phage sequence with short direct terminal repeats (DTR-short) :" | |
177 print "\npython PhageTerm.py -f test-data/DTR-short.fastq -r test-data/DTR-short.fasta -n TEST_short_direct_terminal_repeats" | |
178 fastq = "test-data/DTR-short.fastq" | |
179 reference = "test-data/DTR-short.fasta" | |
180 phagename = "Test-short-direct-terminal-repeats" | |
181 | |
182 elif options.test == "DL": | |
183 print "\nPerforming a test run using test phage sequence with long direct terminal repeats (DTR-long) :" | |
184 print "\npython PhageTerm.py -f test-data/DTR-long.fastq -r test-data/DTR-long.fasta -n TEST_long_direct_terminal_repeats" | |
185 fastq = "test-data/DTR-long.fastq" | |
186 reference = "test-data/DTR-long.fasta" | |
187 phagename = "Test-long-direct-terminal-repeats" | |
188 | |
189 elif options.test == "H": | |
190 print "\nPerforming a test run using test phage sequence with headful packaging" | |
191 print "\npython PhageTerm.py -f test-data/Headful.fastq -r test-data/Headful.fasta -n TEST_headful" | |
192 fastq = "test-data/Headful.fastq" | |
193 reference = "test-data/Headful.fasta" | |
194 phagename = "Test-Headful" | |
195 surrounding = 0 | |
196 | |
197 elif options.test == "M": | |
198 print "\nPerforming a test run using test phage sequence with Mu-like packaging" | |
199 print "\npython PhageTerm.py -f test-data/Mu-like_R1.fastq -p test-data/Mu-like_R2.fastq -r test-data/Mu-like.fasta -n TEST_Mu-like -g test-data/Mu-like_host.fasta" | |
200 fastq = "test-data/Mu-like_R1.fastq" | |
201 paired = "test-data/Mu-like_R2.fastq" | |
202 reference = "test-data/Mu-like.fasta" | |
203 host = "test-data/Mu-like_host.fasta" | |
204 phagename = "Test-Mu-like" | |
205 surrounding = 0 | |
206 | |
207 ###### | |
208 | |
209 | |
210 # CHECK inputs | |
211 phagename = checkPhageName(phagename) | |
212 | |
213 if checkFastaFile(reference): | |
214 exit("ERROR in reference file") | |
215 | |
216 if host != "": | |
217 if checkFastaFile(host): | |
218 exit("ERROR in reference file") | |
219 | |
220 # VARIABLE | |
221 edge = 500 | |
222 insert_max = 1000 | |
223 limit_fixed = 35 | |
224 limit_preferred = 11 | |
225 limit_coverage = max(50,mean*2)/core | |
226 Mu_threshold = 0.5 | |
227 draw = 0 | |
228 if seed < 15: | |
229 seed = 15 | |
230 | |
231 # READS Number | |
232 tot_reads = totReads(fastq) | |
233 if paired != "": | |
234 tot_reads_paired = totReads(paired) | |
235 if (tot_reads != tot_reads_paired): | |
236 print "\nWARNING: Number of reads between the two reads files differ, using single reads only\n" | |
237 paired = "" | |
238 | |
239 # REFERENCE sequence recovery; both ends are padded with `edge` bases taken from the opposite end | |
240 refseq = genomeFastaRecovery(reference) | |
241 refseq = refseq[-edge:] + refseq + refseq[:edge] | |
242 | |
243 # HOST sequence recovery | |
244 hostseq = genomeFastaRecovery(host) | |
245 if len(hostseq) != 0 and len(hostseq) < len(refseq): | |
246 print "\nHost length < Phage length : removing host sequence." | |
247 hostseq = "" | |
248 if hostseq != "": | |
249 hostseq = hostseq[-edge:] + hostseq + hostseq[:edge] | |
250 | |
251 | |
252 ### COVERAGE | |
253 print "\nCalculating coverage values, please wait (may take a while)...\n" | |
254 | |
255 if not test_run and core == 1: | |
256 print "If your computer has more than 1 processor, you can use the -c or --core option to speed up the process.\n\n" | |
257 | |
258 jobs = [] | |
259 manager = Manager() | |
260 return_dict = manager.dict() | |
261 | |
262 # Start position of each core's chunk of reads, followed by the total read count | |
263 file_split = int(tot_reads/core) | |
264 position = [] | |
265 | |
266 l = range(int(tot_reads)) | |
267 part = chunks(l, core) | |
268 for i in range(core): | |
269 position.append(part.next()[0]) | |
270 | |
271 position = position + [int(tot_reads)] | |
272 | |
273 for i in range(0, core): | |
274 process = multiprocessing.Process(target=readsCoverage, args=(fastq, refseq, hostseq, tot_reads, seed, edge, paired, insert_max, core, i, return_dict, position[i], position[i+1], limit_coverage)) | |
275 jobs.append(process) | |
276 | |
277 for j in jobs: | |
278 j.start() | |
279 | |
280 for j in jobs: | |
281 j.join() | |
282 | |
283 print "\n\nFinished calculating coverage values, the remainder should be completed rapidly\n" | |
284 | |
285 # merging results | |
286 for core_id in range(core): | |
287 if core_id == 0: | |
288 termini_coverage = return_dict[core_id][0] | |
289 whole_coverage = return_dict[core_id][1] | |
290 paired_whole_coverage = return_dict[core_id][2] | |
291 phage_hybrid_coverage = return_dict[core_id][3] | |
292 host_hybrid_coverage = return_dict[core_id][4] | |
293 host_whole_coverage = return_dict[core_id][5] | |
294 list_hybrid = return_dict[core_id][6] | |
295 insert = return_dict[core_id][7].tolist() | |
296 paired_missmatch = return_dict[core_id][8] | |
297 reads_tested = return_dict[core_id][9] | |
298 else: | |
299 termini_coverage += return_dict[core_id][0] | |
300 whole_coverage += return_dict[core_id][1] | |
301 paired_whole_coverage += return_dict[core_id][2] | |
302 phage_hybrid_coverage += return_dict[core_id][3] | |
303 host_hybrid_coverage += return_dict[core_id][4] | |
304 host_whole_coverage += return_dict[core_id][5] | |
305 list_hybrid += return_dict[core_id][6] | |
306 insert += return_dict[core_id][7].tolist() | |
307 paired_missmatch += return_dict[core_id][8] | |
308 reads_tested += return_dict[core_id][9] | |
309 | |
310 termini_coverage = termini_coverage.tolist() | |
311 whole_coverage = whole_coverage.tolist() | |
312 paired_whole_coverage = paired_whole_coverage.tolist() | |
313 phage_hybrid_coverage = phage_hybrid_coverage.tolist() | |
314 host_hybrid_coverage = host_hybrid_coverage.tolist() | |
315 host_whole_coverage = host_whole_coverage.tolist() | |
316 list_hybrid = list_hybrid.tolist() | |
317 | |
318 | |
319 # WHOLE Coverage : Average, Maximum and Minimum | |
320 added_whole_coverage, ave_whole_cov = wholeCov(whole_coverage, len(refseq)) | |
321 added_paired_whole_coverage, ave_paired_whole_cov = wholeCov(paired_whole_coverage, len(refseq)) | |
322 added_host_whole_coverage, ave_host_whole_cov = wholeCov(host_whole_coverage, len(hostseq)) | |
323 | |
324 drop_cov = testwholeCov(added_whole_coverage, ave_whole_cov, test_run) | |
325 | |
326 # NORM pic by whole coverage (1 base) | |
327 if paired != "": | |
328 paired_whole_coverage_test = maxPaired(paired_whole_coverage, whole_coverage) | |
329 termini_coverage_norm, mean_nc = normCov(termini_coverage, paired_whole_coverage, ave_whole_cov/1.5, edge) | |
330 else: | |
331 termini_coverage_norm, mean_nc = normCov(termini_coverage, whole_coverage, ave_whole_cov/1.5, edge) | |
332 | |
333 # REMOVE edge | |
334 termini_coverage[0] = RemoveEdge(termini_coverage[0],edge) | |
335 termini_coverage[1] = RemoveEdge(termini_coverage[1],edge) | |
336 termini_coverage_norm[0] = RemoveEdge(termini_coverage_norm[0],edge) | |
337 termini_coverage_norm[1] = RemoveEdge(termini_coverage_norm[1],edge) | |
338 whole_coverage[0] = RemoveEdge(whole_coverage[0],edge) | |
339 whole_coverage[1] = RemoveEdge(whole_coverage[1],edge) | |
340 paired_whole_coverage[0] = RemoveEdge(paired_whole_coverage[0],edge) | |
341 paired_whole_coverage[1] = RemoveEdge(paired_whole_coverage[1],edge) | |
342 added_whole_coverage = RemoveEdge(added_whole_coverage,edge) | |
343 added_paired_whole_coverage = RemoveEdge(added_paired_whole_coverage,edge) | |
344 added_host_whole_coverage = RemoveEdge(added_host_whole_coverage,edge) | |
345 phage_hybrid_coverage[0] = RemoveEdge(phage_hybrid_coverage[0],edge) | |
346 phage_hybrid_coverage[1] = RemoveEdge(phage_hybrid_coverage[1],edge) | |
347 host_whole_coverage[0] = RemoveEdge(host_whole_coverage[0],edge) | |
348 host_whole_coverage[1] = RemoveEdge(host_whole_coverage[1],edge) | |
349 host_hybrid_coverage[0] = RemoveEdge(host_hybrid_coverage[0],edge) | |
350 host_hybrid_coverage[1] = RemoveEdge(host_hybrid_coverage[1],edge) | |
351 refseq = RemoveEdge(refseq,edge) | |
352 if host != "": | |
353 hostseq = RemoveEdge(hostseq,edge) | |
354 gen_len = len(refseq) | |
355 host_len = len(hostseq) | |
356 if options.test == "DL": | |
357 gen_len = 100000 | |
358 | |
359 | |
360 # READS Total, Used and Lost | |
361 used_reads, lost_reads, lost_perc = usedReads(termini_coverage, reads_tested) | |
362 | |
363 # PIC Max | |
364 picMaxPlus, picMaxMinus, TopFreqH = picMax(termini_coverage, 5) | |
365 picMaxPlus_norm, picMaxMinus_norm, TopFreqH_norm = picMax(termini_coverage_norm, 5) | |
366 picMaxPlus_host, picMaxMinus_host, TopFreqH_host = picMax(host_whole_coverage, 5) | |
367 | |
368 ### ANALYSIS | |
369 | |
370 ## Close Peaks | |
371 picMaxPlus, picOUT_forw = RemoveClosePicMax(picMaxPlus, gen_len, surrounding) | |
372 picMaxMinus, picOUT_rev = RemoveClosePicMax(picMaxMinus, gen_len, surrounding) | |
373 picMaxPlus_norm, picOUT_norm_forw = RemoveClosePicMax(picMaxPlus_norm, gen_len, surrounding) | |
374 picMaxMinus_norm, picOUT_norm_rev = RemoveClosePicMax(picMaxMinus_norm, gen_len, surrounding) | |
375 | |
376 termini_coverage_close = termini_coverage[:] | |
377 termini_coverage_close[0], picOUT_forw = addClosePic(termini_coverage[0], picOUT_forw) | |
378 termini_coverage_close[1], picOUT_rev = addClosePic(termini_coverage[1], picOUT_rev) | |
379 | |
380 termini_coverage_norm_close = termini_coverage_norm[:] | |
381 termini_coverage_norm_close[0], picOUT_norm_forw = addClosePic(termini_coverage_norm[0], picOUT_norm_forw, 1) | |
382 termini_coverage_norm_close[1], picOUT_norm_rev = addClosePic(termini_coverage_norm[1], picOUT_norm_rev, 1) | |
383 | |
384 | |
385 ## Statistical Analysis | |
386 picMaxPlus_norm_close, picMaxMinus_norm_close, TopFreqH_norm = picMax(termini_coverage_norm_close, 5) | |
387 | |
388 if paired != "": | |
389 phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close) | |
390 else: | |
391 phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close) | |
392 | |
393 | |
394 ## LI Analysis | |
395 picMaxPlus_close, picMaxMinus_close, TopFreqH = picMax(termini_coverage_close, 5) | |
396 | |
397 R1, AveFreq = ratioR1(TopFreqH, used_reads, gen_len) | |
398 R2 = ratioR(picMaxPlus_close) | |
399 R3 = ratioR(picMaxMinus_close) | |
400 | |
401 ArtPackmode, termini, forward, reverse = packMode(R1, R2, R3) | |
402 ArtOrient = orientation(picMaxPlus_close, picMaxMinus_close) | |
403 ArtcohesiveSeq, ArtPackmode = sequenceCohesive(ArtPackmode, refseq, picMaxPlus_close, picMaxMinus_close, gen_len/2) | |
404 | |
405 | |
406 ### DECISION Process | |
407 | |
408 # PEAKS Significance | |
409 plus_significant = selectSignificant(phage_plus_norm, 1.0/gen_len, limit_preferred) | |
410 minus_significant = selectSignificant(phage_minus_norm, 1.0/gen_len, limit_preferred) | |
411 | |
412 # DECISION | |
413 Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like = decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq) | |
414 | |
415 | |
416 ### EXPORT Data | |
417 | |
418 ## Statistics | |
419 ExportStatistics(phagename, whole_coverage, paired_whole_coverage, termini_coverage, phage_plus_norm, phage_minus_norm, paired, test_run) | |
420 | |
421 # Sequence | |
422 ExportCohesiveSeq(phagename, ArtcohesiveSeq, P_seqcoh, test_run) | |
423 ExportPhageSequence(phagename, P_left, P_right, refseq, P_orient, Redundant, Mu_like, P_class, P_seqcoh, test_run) | |
424 | |
425 # Report | |
426 CreateReport(phagename, seed, added_whole_coverage, draw, Redundant, P_left, P_right, Permuted, P_orient, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, tot_reads, P_seqcoh, phage_plus_norm, phage_minus_norm, ArtPackmode, termini, forward, reverse, ArtOrient, ArtcohesiveSeq, termini_coverage_close, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, lost_perc, ave_whole_cov, R1, R2, R3, host, host_len, host_whole_coverage, picMaxPlus_host, picMaxMinus_host, surrounding, drop_cov, paired, insert, phage_hybrid_coverage, host_hybrid_coverage, added_paired_whole_coverage, Mu_like, test_run, P_class, P_type, P_concat) | |
427 |