annotate PhageTerm.py @ 1:ee73cdf35532 draft default tip

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 16:02:03 +0000
parents 69e8f12c8b31
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
1 #! /usr/bin/env python
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
2 # -*- coding: utf-8 -*-
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
3 ##@file phageterm.py
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
4 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
5 # main program
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
6 ## PhageTerm software
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
7 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
8 # Phageterm is a tool to determine phage termini and packaging strategy
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
9 # and other useful informations using raw sequencing reads.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
10 # (This programs works with sequencing reads from a randomly
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
11 # sheared DNA library preparations as Illumina TruSeq paired-end or similar)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
12 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
13 # ----------------------------------------------------------------------
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
14 # Copyright (C) 2017 Julian Garneau
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
15 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
16 # This program is free software; you can redistribute it and/or modify
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
17 # it under the terms of the GNU General Public License as published by
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
18 # the Free Software Foundation; either version 3 of the License, or
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
19 # (at your option) any later version.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
20 # <http://www.gnu.org/licenses/gpl-3.0.html>
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
21 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
22 # This program is distributed in the hope that it will be useful,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
23 # but WITHOUT ANY WARRANTY; without even the implied warranty of
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
24 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
25 # GNU General Public License for more details.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
26 # ----------------------------------------------------------------------
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
27 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
28 # @author Julian Garneau <julian.garneau@usherbrooke.ca>
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
29 # @author Marc Monot <marc.monot@pasteur.fr>
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
30 # @author David Bikard <david.bikard@pasteur.fr>
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
31
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
32
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
33 ### PYTHON Module
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
34 # Base
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
35 #import sys
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
36
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
37
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
38 from __future__ import print_function
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
39
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
40 # Multiprocessing
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
41 import multiprocessing
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
42 import os
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
43 from multiprocessing import Manager
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
44
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
45
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
46 # Project
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
47
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
48 from _modules.utilities import checkReportTitle
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
49 from _modules.functions_PhageTerm import *
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
50 from _modules.common_readsCoverage_processing import processCovValuesForSeq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
51 from _modules.main_utils import setOptions,checkOptArgsConsistency
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
52
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
53
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
54 ### MAIN
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
55 def main():
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
56
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
57 getopt=setOptions()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
58 inRawDArgs, fParms, tParms, inDArgs=checkOptArgsConsistency(getopt)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
59
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
60 # For each fasta in file
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
61 DR = {"Headful (pac)":{}, "COS (5')":{}, "COS (3')":{}, "COS":{}, "DTR (short)":{}, "DTR (long)":{}, "Mu-like":{}, "UNKNOWN":{}, "NEW":{}}
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
62 results_pos = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
63 no_match = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
64 draw = 0 # used when one wants to draw some graphs.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
65 chk_handler = RCCheckpoint_handler(tParms.chk_freq, tParms.dir_chk, tParms.test_mode)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
66 ## VL: keep this code just in case we want to try GPU implementation again later.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
67 # if tParms.gpu!=0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
68 # ref_data = refData(inDArgs.refseq_liste, fParms.seed, inDArgs.hostseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
69 # nb_extracts=inRawDArgs.tot_reads
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
70 # if (inRawDArgs.paired!=""):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
71 # nb_extracts_per_read=7
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
72 # else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
73 # nb_extracts_per_read=4
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
74 # nb_extracts *= nb_extracts_per_read
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
75 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
76 # gpu_mapping_res_dir = tParms.gpu_mapping_res_dir
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
77 # wanted_gpu_nb_chunks = tParms.wanted_chunks
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
78 # mapper = GPU_chunkMapper()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
79 # mapper.setRefData(ref_data)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
80 # mapper.setFicDir(gpu_mapping_res_dir)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
81 # nb_kmer_in_chunk = nb_extracts//wanted_gpu_nb_chunks
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
82 # doMapping(nb_kmer_in_chunk, mapper, inRawDArgs.fastq, "", ref_data, nb_extracts_per_read)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
83 # if tParms.gpu_mapping_res_dir!=0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
84 # exit() # Consider that if we put results in files, it is because we are processing large datasets on a cluster. Otherwise, go on working.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
85 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
86 # if tParms.dir_cov_res!=None and tParms.gpu_mapping_res_dir!=None: # Process the mapping results produced by the GPU and put results in files
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
87 # if tParms.idx_chunk==None or tParms.idx_seq==None:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
88 # print "Indicate index of chunk and sequence to process"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
89 # exit(1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
90 # seq_info = seqInfo(inDArgs.refseq_liste[tParms.idx_seq],tParms.idx_seq, inDArgs.hostseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
91 # fname=os.path.join(tParms.gpu_mapping_res_dir,base_fname_rinfo+str(tParms.idx_chunk))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
92 # d_rinfo=load_d_rinfo(fname)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
93 # readsCoverageGPU_chunk(inRawDArgs.fastq, seq_info, tParms.idx_chunk, d_rinfo, fParms.edge, tParms.limit_coverage, fParms.virome, tParms.gpu_mapping_res_dir,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
94 # tParms.dir_cov_res, logger=None)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
95 # exit() # Consider that if we put results in files, it is because we are processing large datasets on a cluster.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
96
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
97 if tParms.multi_machine:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
98 print("Running on cluster")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
99 print(tParms.dir_cov_mm, tParms.seq_id, tParms.dir_seq_mm, tParms.DR_path)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
100 if tParms.dir_cov_mm!=None and tParms.gpu_mapping_res_dir==None and tParms.dir_seq_mm==None: # perform mapping and readCoverage calculation and write results in file.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
101 # In that case we are processing data in an embarrassingly parallel way on a cluster.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
102 position = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
103 read_indices = list(range(int(inRawDArgs.tot_reads)))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
104 part = chunks(read_indices, tParms.core)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
105 for i in range(tParms.core):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
106 position.append(next(part)[0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
107
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
108 position = position + [int(inRawDArgs.tot_reads)]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
109 idx_refseq=chk_handler.getIdxSeq(tParms.core_id)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
110 print("starting processing at sequence: ",idx_refseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
111 for refseq in inDArgs.refseq_liste[idx_refseq:]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
112 readsCoverage(inRawDArgs, refseq, inDArgs, fParms,None,tParms.core_id, position[tParms.core_id], position[tParms.core_id + 1],
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
113 tParms,chk_handler,idx_refseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
114 print("Processed: ", idx_refseq, " sequences")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
115 idx_refseq+=1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
116 if tParms.core_id==0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
117 fname=os.path.join(tParms.dir_cov_mm,"nb_seq_processed.txt")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
118 f=open(fname,"w")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
119 f.write(str(idx_refseq))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
120 f.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
121 exit() # Consider that if we put results in files, it is because we are processing large datasets on a cluster.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
122 if tParms.dir_cov_mm!=None and tParms.seq_id!=None and tParms.dir_seq_mm!=None and tParms.DR_path!=None:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
123 from _modules.seq_processing import sum_readsCoverage_for_seq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
124 # in that case, we are processing all the results of readCoverage sequence by sequence in an embarrassingly parallel way on a cluster.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
125 sum_readsCoverage_for_seq(tParms.dir_cov_mm, tParms.seq_id, tParms.nb_pieces, inDArgs, fParms, inRawDArgs, tParms.dir_seq_mm,tParms.DR_path)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
126 exit()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
127 if tParms.dir_seq_mm!=None and tParms.dir_cov_mm==None and tParms.seq_id==None and tParms.DR_path!=None: # report generation
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
128 from _modules.generate_report import loadDR,genReport
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
129 loadDR(tParms.DR_path, DR)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
130 genReport(fParms, inDArgs, inRawDArgs, no_match, DR)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
131 exit()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
132 else: # mono machine original multi processing mode.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
133 ### COVERAGE
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
134 print("\nCalculating coverage values, please wait (may take a while)...\n")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
135 start_run = time.time()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
136
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
137 if not fParms.test_run and tParms.core == 1:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
138 print("If your computer has more than 1 processor, you can use the -c or --core option to speed up the process.\n\n")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
139
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
140
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
141 for refseq in inDArgs.refseq_liste:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
142 jobs = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
143 manager = Manager()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
144 return_dict = manager.dict()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
145 position = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
146
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
147 read_indices = list(range(int(inRawDArgs.tot_reads)))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
148 part = chunks(read_indices, tParms.core)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
149 for i in range(tParms.core):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
150 position.append(next(part)[0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
151
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
152 position = position + [int(inRawDArgs.tot_reads)]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
153
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
154 for i in range(0, tParms.core):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
155 tParms.core_id=i
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
156 process = multiprocessing.Process(target=readsCoverage, args=(inRawDArgs, refseq, inDArgs, fParms,return_dict, i,position[i], position[i+1],
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
157 tParms, chk_handler,results_pos))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
158 jobs.append(process)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
159
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
160 for j in jobs:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
161 j.start()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
162
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
163 for j in jobs:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
164 j.join()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
165
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
166 # merging results
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
167 for core_id in range(tParms.core):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
168 if core_id == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
169 termini_coverage = return_dict[core_id][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
170 whole_coverage = return_dict[core_id][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
171 paired_whole_coverage = return_dict[core_id][2]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
172 phage_hybrid_coverage = return_dict[core_id][3]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
173 host_hybrid_coverage = return_dict[core_id][4]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
174 host_whole_coverage = return_dict[core_id][5]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
175 list_hybrid = return_dict[core_id][6]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
176 insert = return_dict[core_id][7].tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
177 paired_missmatch = return_dict[core_id][8]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
178 reads_tested = return_dict[core_id][9]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
179 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
180 termini_coverage += return_dict[core_id][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
181 whole_coverage += return_dict[core_id][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
182 paired_whole_coverage += return_dict[core_id][2]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
183 phage_hybrid_coverage += return_dict[core_id][3]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
184 host_hybrid_coverage += return_dict[core_id][4]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
185 host_whole_coverage += return_dict[core_id][5]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
186 list_hybrid += return_dict[core_id][6]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
187 insert += return_dict[core_id][7].tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
188 paired_missmatch += return_dict[core_id][8]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
189 reads_tested += return_dict[core_id][9]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
190
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
191 termini_coverage = termini_coverage.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
192 whole_coverage = whole_coverage.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
193 paired_whole_coverage = paired_whole_coverage.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
194 phage_hybrid_coverage = phage_hybrid_coverage.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
195 host_hybrid_coverage = host_hybrid_coverage.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
196 host_whole_coverage = host_whole_coverage.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
197 list_hybrid = list_hybrid.tolist()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
198
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
199
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
200 # Estimate fParms.virome run time
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
201 if fParms.virome:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
202 end_run = time.time()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
203 virome_run = int((end_run - start_run) * inDArgs.nbr_virome)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
204 print("\n\nThe fasta file tested contains: " + str(inDArgs.nbr_virome) + " contigs (mean length: " + str(
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
205 inDArgs.mean_virome) + ")")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
206 print("\nA complete run takes approximatively (" + str(tParms.core) + " core used) : " + EstimateTime(
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
207 virome_run) + "\n")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
208 exit()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
209
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
210 # Contigs without any match
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
211 if sum(termini_coverage[0]) + sum(termini_coverage[1]) == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
212 no_match.append((checkReportTitle(inDArgs.refseq_name[results_pos])))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
213 continue
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
214
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
215 s_stats=processCovValuesForSeq(refseq,inDArgs.hostseq,inDArgs.refseq_name,inDArgs.refseq_liste,fParms.seed,inRawDArgs.analysis_name,inRawDArgs.tot_reads,\
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
216 results_pos,fParms.test_run, inRawDArgs.paired,fParms.edge,inRawDArgs.host,fParms.test, fParms.surrounding,\
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
217 fParms.limit_preferred,fParms.limit_fixed,fParms.Mu_threshold,termini_coverage,whole_coverage,\
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
218 paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, host_whole_coverage,insert,list_hybrid,reads_tested,DR)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
219
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
220
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
221 results_pos += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
222
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
223
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
224
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
225 ### EXPORT Data
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
226 if len(inDArgs.refseq_liste) == 1:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
227 # Test No Match
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
228 if len(no_match) == 1:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
229 print("\n\nERROR: No reads match, please check your reference file.")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
230 exit()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
231
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
232 # Text report only
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
233 if fParms.workflow:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
234 WorkflowReport(inRawDArgs.analysis_name, s_stats.P_class, s_stats.P_left, s_stats.P_right, s_stats.P_type, s_stats.P_orient, s_stats.ave_whole_cov)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
235 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
236 # Statistics
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
237 ExportStatistics(inRawDArgs.analysis_name, whole_coverage, paired_whole_coverage, termini_coverage, s_stats.phage_plus_norm, s_stats.phage_minus_norm, inRawDArgs.paired, fParms.test_run)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
238
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
239 # Sequence
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
240 ExportCohesiveSeq(inRawDArgs.analysis_name, s_stats.ArtcohesiveSeq, s_stats.P_seqcoh, fParms.test_run)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
241 ExportPhageSequence(inRawDArgs.analysis_name, s_stats.P_left, s_stats.P_right, refseq, s_stats.P_orient, s_stats.Redundant, s_stats.Mu_like, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
242 s_stats.P_class, s_stats.P_seqcoh, fParms.test_run)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
243
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
244 # Report
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
245 # TODO: just pass s_stat as argument; it will be cleaner.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
246 CreateReport(inRawDArgs.analysis_name, fParms.seed, s_stats.added_whole_coverage, draw, s_stats.Redundant, s_stats.P_left, s_stats.P_right, s_stats.Permuted, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
247 s_stats.P_orient, s_stats.termini_coverage_norm_close, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
248 s_stats.picMaxPlus_norm_close, s_stats.picMaxMinus_norm_close, s_stats.gen_len, inRawDArgs.tot_reads, s_stats.P_seqcoh, s_stats.phage_plus_norm, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
249 s_stats.phage_minus_norm, s_stats.ArtPackmode, s_stats.termini, s_stats.forward, s_stats.reverse, s_stats.ArtOrient, s_stats.ArtcohesiveSeq, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
250 s_stats.termini_coverage_close, s_stats.picMaxPlus_close, s_stats.picMaxMinus_close, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
251 s_stats.picOUT_norm_forw, s_stats.picOUT_norm_rev, s_stats.picOUT_forw, s_stats.picOUT_rev, s_stats.lost_perc, s_stats.ave_whole_cov, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
252 s_stats.R1, s_stats.R2, s_stats.R3, inRawDArgs.host, len(inDArgs.hostseq), host_whole_coverage, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
253 s_stats.picMaxPlus_host, s_stats.picMaxMinus_host, fParms.surrounding, s_stats.drop_cov, inRawDArgs.paired, insert, phage_hybrid_coverage,\
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
254 host_hybrid_coverage, s_stats.added_paired_whole_coverage, s_stats.Mu_like, fParms.test_run, s_stats.P_class, s_stats.P_type, s_stats.P_concat)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
255
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
256 if (inRawDArgs.nrt==True): # non regression tests, dump phage class name into file for later checking.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
257 fnrt=open("nrt.txt","w")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
258 fnrt.write(s_stats.P_class)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
259 fnrt.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
260 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
261 # Test No Match
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
262 if len(no_match) == inDArgs.nbr_virome:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
263 print("\n\nERROR: No reads match, please check your reference file.")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
264 exit()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
265
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
266 # Report Resume
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
267 multiReport = SummaryReport(inRawDArgs.analysis_name, DR, no_match)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
268 multiCohSeq = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
269 multiPhageSeq = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
270 multiWorkflow = "#analysis_name\tClass\tLeft\tPVal\tAdjPval\tRight\tPVal\tAdjPval\tType\tOrient\tCoverage\tComments\n"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
271
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
272 # No Match in workflow
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
273 if fParms.workflow:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
274 for no_match_contig in no_match:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
275 multiWorkflow += WorkflowReport(no_match_contig, "-", "-", "-", "-", "-", 0, 1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
276
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
277 for DPC in DR:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
278 for DC in DR[DPC]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
279 stat_dict = DR[DPC][DC] # splat this in everywhere
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
280 # Text report
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
281 if fParms.workflow:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
282 multiWorkflow += WorkflowReport(phagename=DC, multi=1, **stat_dict)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
283 # Sequence
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
284 idx_refseq=DR[DPC][DC]["idx_refseq_in_list"]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
285 refseq=inDArgs.refseq_liste[idx_refseq]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
286 multiCohSeq += ExportCohesiveSeq(DC, stat_dict["ArtcohesiveSeq"], stat_dict["P_seqcoh"], fParms.test_run, 1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
287 multiPhageSeq += ExportPhageSequence(DC, stat_dict["P_left"], stat_dict["P_right"], refseq, stat_dict["P_orient"], stat_dict["Redundant"], stat_dict["Mu_like"], stat_dict["P_class"], stat_dict["P_seqcoh"], fParms.test_run, 1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
288
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
289 # Report
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
290 multiReport = CreateReport(phagename=DC,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
291 draw=draw,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
292 multi=1,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
293 multiReport=multiReport,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
294 **stat_dict)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
295
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
296 # Workflow
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
297 if not fParms.test:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
298 if fParms.workflow:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
299 filoutWorkflow = open(inRawDArgs.analysis_name + "_workflow.txt", "w")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
300 filoutWorkflow.write(multiWorkflow)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
301 filoutWorkflow.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
302
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
303 # Concatene Sequences
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
304 filoutCohSeq = open(inRawDArgs.analysis_name + "_cohesive-sequence.fasta", "w")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
305 filoutCohSeq.write(multiCohSeq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
306 filoutCohSeq.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
307
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
308 filoutPhageSeq = open(inRawDArgs.analysis_name + "_sequence.fasta", "w")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
309 filoutPhageSeq.write(multiPhageSeq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
310 filoutPhageSeq.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
311
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
312 # Concatene Report
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
313 doc = SimpleDocTemplate("%s_PhageTerm_report.pdf" % inRawDArgs.analysis_name, pagesize=letter, rightMargin=10,leftMargin=10, topMargin=5, bottomMargin=10)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
314 doc.build(multiReport)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
315
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
316
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
317 # Real virome run time
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
318 end_run = time.time()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
319 virome_run = int(end_run-start_run)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
320 print("\nThe fasta file tested contains: " + str(inDArgs.nbr_virome) + " contigs (mean length: " + str(inDArgs.mean_virome) + ")")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
321 print("The run has taken (" + str(tParms.core) + " core used) : " + EstimateTime(virome_run) + "\n")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
322 exit()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
323
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
324
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
325
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
326 if __name__ == '__main__':
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
327 main()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
328
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
329
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
330
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
331
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
332
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
333
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
334
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
335
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
336