Mercurial repository: mmonot / phageterm
comparison: _modules/functions_PhageTerm.py @ 24:c8f88ae512f3 (draft default tip)
changesets compared: 23:26ceb3225190 vs 24:c8f88ae512f3
author: mmonot
date: Tue, 17 Sep 2024 13:35:16 +0000
description: Uploaded
1 #! /usr/bin/env python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # This file is a part of PhageTerm software | |
5 # A tool to determine phage termini and packaging strategy | |
6 # and other useful information using raw sequencing reads. | |
7 # (This program works with sequencing reads from a randomly | |
8 # sheared DNA library preparation, such as Illumina TruSeq paired-end or similar) | |
9 # | |
10 # ---------------------------------------------------------------------- | |
11 # Copyright (C) 2017 Julian Garneau | |
12 # | |
13 # This program is free software; you can redistribute it and/or modify | |
14 # it under the terms of the GNU General Public License as published by | |
15 # the Free Software Foundation; either version 3 of the License, or | |
16 # (at your option) any later version. | |
17 # <http://www.gnu.org/licenses/gpl-3.0.html> | |
18 # | |
19 # This program is distributed in the hope that it will be useful, | |
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 # GNU General Public License for more details. | |
23 # ---------------------------------------------------------------------- | |
24 # | |
25 # @author Julian Garneau <julian.garneau@usherbrooke.ca> | |
26 # @author Marc Monot <marc.monot@pasteur.fr> | |
27 # @author David Bikard <david.bikard@pasteur.fr> | |
28 | |
29 | |
30 ### PYTHON Module | |
31 # Base | |
32 import os | |
33 import shlex | |
34 import subprocess | |
35 import re | |
36 import random | |
37 import sys | |
38 import struct | |
39 import heapq | |
40 import itertools | |
41 import gzip | |
42 | |
43 import matplotlib | |
44 matplotlib.use('Agg') | |
45 import matplotlib.pyplot as plt | |
46 from matplotlib import patches | |
47 from matplotlib.path import Path | |
48 | |
49 import numpy as np | |
50 import pandas as pd | |
51 import pylab as pl | |
52 | |
53 # Statistics | |
54 from scipy import stats | |
55 from statsmodels.sandbox.stats.multicomp import multipletests | |
56 from sklearn.tree import DecisionTreeRegressor | |
57 | |
58 # collections | |
59 from collections import OrderedDict, namedtuple | |
60 | |
61 # String | |
62 from string import maketrans | |
63 import cStringIO | |
64 | |
65 # PDF report building | |
66 import time | |
67 from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER, TA_LEFT, TA_RIGHT | |
68 from reportlab.lib.pagesizes import letter, landscape | |
69 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle, PageBreak | |
70 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle | |
71 from reportlab.lib.units import inch | |
72 from reportlab.lib import colors | |
73 from reportlab.lib.utils import ImageReader | |
74 from reportlab.pdfgen import canvas | |
75 | |
76 | |
77 ### FILE function | |
78 def checkFastaFile(filin): | |
79 """Check sequence Fasta file given by user""" | |
80 first_line = 1 | |
81 try: | |
82 infil = open(filin, 'rU') | |
83 for line in infil: | |
84 # Test '>' | |
85 if first_line : | |
86 if line[0] != '>': | |
87 return 1 | |
88 else: | |
89 first_line = 0 | |
90 continue | |
91 # Test 1st base per line : 'ATGCN' | |
92 base = changeCase(line[0]) | |
93 if base != 'A' and base != 'T' and base != 'C' and base != 'G' and base != 'N' and base != '\n' and base != '\r': | |
94 infil.close() | |
95 return 1 | |
96 infil.close() | |
97 return 0 | |
98 except IOError: | |
99 sys.exit('ERROR: No such file %s' % filin) | |
100 | |
101 def checkPhageName(phagename): | |
102 """Normalise phagename (take out any special char)""" | |
103 phagenameNorm = "" | |
104 charok = range(48,58) + range(65,91) + range(97,123) + [45,95] | |
105 for char in phagename: | |
106 if ord(char) in charok: | |
107 phagenameNorm += char | |
108 if len(phagenameNorm) > 1: | |
109 return phagenameNorm | |
110 else: | |
111 return "Phage" | |
112 | |
113 | |
114 ### UTILITY function | |
115 def chunks(l, n): | |
116 """Yield n successive chunks from l.""" | |
117 newn = int(1.0 * len(l) / n + 0.5) | |
118 for i in xrange(0, n-1): | |
119 yield l[i*newn:i*newn+newn] | |
120 yield l[n*newn-newn:] | |
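# Illustrative usage sketch (added for clarity, not part of the original module),
# assuming Python 2 where range() returns a list:
#   >>> list(chunks(range(10), 3))        # chunk size = int(1.0*10/3 + 0.5) = 3
#   [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]  # the last chunk absorbs the remainder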
121 | |
122 | |
123 ### SEQUENCE parsing function | |
124 def genomeFastaRecovery(filin): | |
125 """Get genome sequence from fasta file""" | |
126 if filin == "": | |
127 return '' | |
128 infile = open(filin, 'r') | |
129 genome_forward = ''.join(map(str,[changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0] for line in infile if line[0] != '>'])) | |
130 infile.close() | |
131 return genome_forward | |
132 | |
133 def RemoveEdge(tableau, edge): | |
134 return tableau[edge:-edge] | |
135 | |
136 | |
137 ### SEQUENCE manipulation function | |
138 def changeCase(seq): | |
139 """Change lower case to UPPER CASE for a sequence string.""" | |
140 return seq.upper() | |
141 | |
142 def reverseComplement(seq, transtab=maketrans('ATGCN', 'TACGN')): | |
143 """Reverse Complement a sequence.""" | |
144 return changeCase(seq).translate(transtab)[::-1] | |
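# Illustrative usage sketch (added for clarity, not part of the original module):
#   >>> reverseComplement("atgcn")
#   'NGCAT'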
145 | |
146 | |
147 ### COVERAGE Starting and Whole function | |
148 def readsCoverage(fastq, refseq, hostseq, tot_reads, seed, edge, paired, insert_max, core, core_id, return_dict, line_start, line_end, limit_coverage): | |
149 """Calculate whole coverage and first base coverage. """ | |
150 gen_len = len(refseq) | |
151 host_len = len(hostseq) | |
152 termini_coverage = np.array([gen_len*[0], gen_len*[0]]) | |
153 whole_coverage = np.array([gen_len*[0], gen_len*[0]]) | |
154 paired_whole_coverage = np.array([gen_len*[0], gen_len*[0]]) | |
155 phage_hybrid_coverage = np.array([gen_len*[0], gen_len*[0]]) | |
156 host_hybrid_coverage = np.array([host_len*[0], host_len*[0]]) | |
157 host_whole_coverage = np.array([host_len*[0], host_len*[0]]) | |
158 list_hybrid = np.array([0,0]) | |
159 insert = [] | |
160 paired_missmatch = 0 | |
161 read_match = 0 | |
162 test_read_seq = match = k = count_line = 0 | |
163 | |
164 | |
165 # Timer | |
166 if core_id == (core-1): | |
167 sys.stdout.write(" 0.0 %") | |
168 sys.stdout.flush() | |
169 | |
170 # Mapping | |
171 if fastq.endswith('.gz'): | |
172 filin = gzip.open(fastq, 'rb') | |
173 else: | |
174 filin = open(fastq) | |
175 line = filin.readline() | |
176 | |
177 if paired != "": | |
178 if paired.endswith('.gz'): | |
179 filin_paired = gzip.open(paired, 'rb') | |
180 else: | |
181 filin_paired = open(paired) | |
182 line_paired = filin_paired.readline() | |
183 | |
184 | |
185 while line: | |
186 | |
187 count_line += 1 | |
188 | |
189 if count_line/4 <= line_start: | |
190 test_read_seq = 0 | |
191 if count_line/4 > line_end: | |
192 break | |
193 | |
194 if test_read_seq: | |
195 k += 1 | |
196 | |
197 # Read sequence | |
198 read = line.split("\n")[0].split("\r")[0] | |
199 line = filin.readline() | |
200 | |
201 if paired != "": | |
202 read_paired = line_paired.split("\n")[0].split("\r")[0] | |
203 line_paired = filin_paired.readline() | |
204 | |
205 readlen = len(read) | |
206 if readlen < seed: | |
207 continue | |
208 corlen = readlen-seed | |
209 | |
210 ### Match sense + (multiple, random pick one) | |
211 matchPplus_start, matchPplus_end = applyCoverage(read[:seed], refseq) | |
212 | |
213 ## Phage | |
214 if matchPplus_start != -1: | |
215 match = 1 | |
216 termini_coverage[0][matchPplus_start]+=1 | |
217 position_end = matchPplus_end+corlen | |
218 | |
219 # whole coverage | |
220 for i in range(matchPplus_start, min(gen_len,position_end)): | |
221 whole_coverage[0][i]+=1 | |
222 | |
223 # Paired-read | |
224 if paired != "": | |
225 matchPplus_start_paired, matchPplus_end_paired = applyCoverage(reverseComplement(read_paired)[-seed:], refseq) | |
226 insert_length = matchPplus_end_paired - matchPplus_start | |
227 if insert_length > 0 and insert_length < insert_max: | |
228 position_end = matchPplus_end_paired | |
229 insert.append(insert_length) | |
230 else: | |
231 paired_missmatch += 1 | |
232 # Paired hybrid | |
233 if hostseq != "" and matchPplus_start_paired == -1: | |
234 matchHplus_start, matchHplus_end = applyCoverage(read_paired[:seed], hostseq) | |
235 if matchHplus_start != -1: | |
236 list_hybrid[0] += 1 | |
237 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) | |
238 host_hybrid_coverage[0] = hybridCoverage(read_paired, hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) | |
239 else: | |
240 matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[:seed], hostseq) | |
241 if matchHminus_start != -1: | |
242 list_hybrid[0] += 1 | |
243 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) | |
244 host_hybrid_coverage[1] = hybridCoverage(reverseComplement(read_paired), hostseq, host_hybrid_coverage[1], matchHminus_start, min(host_len,matchHminus_end+corlen) ) | |
245 | |
246 # Single hybrid | |
247 elif hostseq != "": | |
248 matchPFplus_start, matchPFplus_end = applyCoverage(read[-seed:], refseq) | |
249 if matchPFplus_start == -1: | |
250 matchHplus_start, matchHplus_end = applyCoverage(read[-seed:], hostseq) | |
251 if matchHplus_start != -1: | |
252 list_hybrid[0] += 1 | |
253 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) | |
254 host_hybrid_coverage[0] = hybridCoverage(read, hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) | |
255 else: | |
256 matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-seed:], hostseq) | |
257 if matchHminus_start != -1: | |
258 list_hybrid[0] += 1 | |
259 phage_hybrid_coverage[0] = hybridCoverage(read, refseq, phage_hybrid_coverage[0], matchPplus_start, min(gen_len,matchPplus_end+corlen) ) | |
260 host_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), hostseq, host_hybrid_coverage[1], matchHminus_start, min(host_len,matchHminus_end+corlen) ) | |
261 | |
262 # whole coverage | |
263 for i in range(matchPplus_start, min(gen_len,position_end)): | |
264 paired_whole_coverage[0][i]+=1 | |
265 | |
266 ### if no match sense +, then test sense - | |
267 if not match: | |
268 matchPminus_start, matchPminus_end = applyCoverage(reverseComplement(read)[-seed:], refseq) | |
269 | |
270 ## Phage | |
271 if matchPminus_end != -1: | |
272 match = 1 | |
273 termini_coverage[1][matchPminus_end-1]+=1 | |
274 position_start = matchPminus_start-corlen | |
275 | |
276 # whole coverage | |
277 for i in range(max(0,position_start), matchPminus_end): | |
278 whole_coverage[1][i]+=1 | |
279 | |
280 # Paired-read | |
281 if paired != "": | |
282 matchPminus_start_paired, matchPminus_end_paired = applyCoverage(read_paired[:seed], refseq) | |
283 insert_length = matchPminus_end - matchPminus_start_paired | |
284 if insert_length > 0 and insert_length < insert_max: | |
285 position_start = matchPminus_start_paired | |
286 insert.append(insert_length) | |
287 else: | |
288 paired_missmatch += 1 | |
289 # Paired hybrid | |
290 if hostseq != "" and matchPminus_start_paired == -1: | |
291 matchHplus_start, matchHplus_end = applyCoverage(read_paired[:seed], hostseq) | |
292 if matchHplus_start != -1: | |
293 list_hybrid[1] += 1 | |
294 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) | |
295 host_hybrid_coverage[0] = hybridCoverage(read_paired, hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) | |
296 | |
297 else: | |
298 matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read_paired)[-seed:], hostseq) | |
299 if matchHminus_start != -1: | |
300 list_hybrid[1] += 1 | |
301 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) | |
302 host_hybrid_coverage[1] = hybridCoverage(reverseComplement(read_paired), hostseq, host_hybrid_coverage[1], matchHminus_start, min(host_len,matchHminus_end+corlen) ) | |
303 | |
304 # Single hybrid | |
305 elif hostseq != "": | |
306 matchPRplus_start, matchPRplus_end = applyCoverage(reverseComplement(read)[:seed], refseq) | |
307 if matchPRplus_start == -1: | |
308 matchHplus_start, matchHplus_end = applyCoverage(read[:seed], hostseq) | |
309 if matchHplus_start != -1: | |
310 list_hybrid[1] += 1 | |
311 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) | |
312 host_hybrid_coverage[0] = hybridCoverage(read, hostseq, host_hybrid_coverage[0], matchHplus_start, min(host_len,matchHplus_end+corlen) ) | |
313 else: | |
314 matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[:seed], hostseq) | |
315 if matchHminus_start != -1: | |
316 list_hybrid[1] += 1 | |
317 phage_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), refseq, phage_hybrid_coverage[1], matchPminus_start, min(gen_len,matchPminus_end+corlen) ) | |
318 host_hybrid_coverage[1] = hybridCoverage(reverseComplement(read), hostseq, host_hybrid_coverage[1], matchHminus_start, min(host_len,matchHminus_end+corlen) ) | |
319 | |
320 # whole coverage | |
321 for i in range(max(0,position_start), matchPminus_end): | |
322 paired_whole_coverage[1][i]+=1 | |
323 | |
324 ### if no match on Phage, test Host | |
325 if not match: | |
326 matchHplus_start, matchHplus_end = applyCoverage(read[:seed], hostseq) | |
327 if matchHplus_start != -1: | |
328 for i in range(matchHplus_start, min(host_len,matchHplus_end+corlen)): | |
329 host_whole_coverage[0][i]+=1 | |
330 else: | |
331 matchHminus_start, matchHminus_end = applyCoverage(reverseComplement(read)[-seed:], hostseq) | |
332 if matchHminus_end != -1: | |
333 for i in range(max(0,matchHminus_start-corlen), matchHminus_end): | |
334 host_whole_coverage[1][i]+=1 | |
335 | |
336 # TEST limit_coverage | |
337 read_match += match*readlen | |
338 | |
339 match = test_read_seq = 0 | |
340 # Timer | |
341 if core_id == (core-1): | |
342 if k%1000 == 0: | |
343 sys.stdout.write("\b\b\b\b\b\b\b\b\b%3.1f %%" %( (float(read_match)/gen_len) / limit_coverage * 100 )) | |
344 sys.stdout.flush() | |
345 | |
346 else: | |
347 if line[0] == "@": | |
348 test_read_seq = 1 | |
349 | |
350 line = filin.readline() | |
351 if paired != "": | |
352 line_paired = filin_paired.readline() | |
353 | |
354 # TEST limit_coverage | |
355 if (read_match/gen_len) > limit_coverage: | |
356 line = 0 | |
357 | |
358 | |
359 if core_id == (core-1): | |
360 sys.stdout.write("\b\b\b\b\b\b\b\b\b%3.1f %%" %( 100 )) | |
361 sys.stdout.flush() | |
362 | |
363 # Close file | |
364 filin.close() | |
365 if paired != "": | |
366 filin_paired.close() | |
367 | |
368 | |
369 # Correct EDGE coverage | |
370 if sum(termini_coverage[0]) + sum(termini_coverage[1]) == 0: | |
371 exit("ERROR: No read Match, please check your fastq file") | |
372 | |
373 termini_coverage = correctEdge(termini_coverage, edge) | |
374 whole_coverage = correctEdge(whole_coverage, edge) | |
375 phage_hybrid_coverage = correctEdge(phage_hybrid_coverage, edge) | |
376 if hostseq != "": | |
377 host_whole_coverage = correctEdge(host_whole_coverage, edge) | |
378 host_hybrid_coverage = correctEdge(host_hybrid_coverage, edge) | |
379 | |
380 insert = np.array(insert) | |
381 | |
382 return_dict[core_id] = [termini_coverage, whole_coverage, paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage, host_whole_coverage, list_hybrid, insert, paired_missmatch, k] | |
383 | |
384 return | |
385 | |
386 def hybridCoverage(read, sequence, hybrid_coverage, start, end): | |
387 """Return hybrid coverage.""" | |
388 aligned_part_only = longest_common_substring(read, sequence[start:end]) | |
389 for i in range(start, min(len(sequence),start+len(aligned_part_only))): | |
390 hybrid_coverage[i]+=1 | |
391 return hybrid_coverage | |
392 | |
393 def applyCoverage(readPart, sequence): | |
394 """Return a random match of a read onto the sequence. """ | |
395 position = [] | |
396 for pos in re.finditer(readPart,sequence): | |
397 position.append(pos) | |
398 if len(position) > 0: | |
399 match = random.choice(position) | |
400 return match.start(), match.end() | |
401 else: | |
402 return -1, -1 | |
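# Illustrative usage sketch (added for clarity, not part of the original module);
# when the seed matches several positions, one match is picked at random:
#   >>> applyCoverage("AC", "ACGTACGT")   # exact matches at 0 and 4
#   (0, 2)                                # or (4, 6), chosen randomly
#   >>> applyCoverage("TTT", "ACGTACGT")  # no match
#   (-1, -1)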
403 | |
404 def correctEdge(coverage, edge): | |
405 """Correction of the Edge coverage. """ | |
406 correctCov = np.array([len(coverage[0])*[0], len(coverage[0])*[0]]) | |
407 End = len(coverage[0]) | |
408 covSta = range(edge) | |
409 covEnd = range(End-edge,End) | |
410 for i in range(len(coverage)): | |
411 for j in range(len(coverage[i])): | |
412 correctCov[i][j] = coverage[i][j] | |
413 for k in covSta: | |
414 correctCov[i][k+edge] += coverage[i][k+End-edge] | |
415 for l in covEnd: | |
416 correctCov[i][l-edge] += coverage[i][l-End+edge] | |
417 return correctCov | |
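# Illustrative usage sketch (added for clarity, not part of the original module);
# with edge=1, counts at each terminal edge region are re-added just inside the
# opposite end (hypothetical values):
#   >>> correctEdge(np.array([[5, 0, 0, 0, 3], [0, 0, 0, 0, 0]]), 1)[0]
#   array([5, 3, 0, 5, 3])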
418 | |
419 def normCov(termini_coverage, whole_coverage, covLimit, edge): | |
420 """Return the termini_coverage normalised by the whole coverage (% of coverage due to first base).""" | |
421 normalised_coverage = [len(termini_coverage[0])*[0], len(termini_coverage[0])*[0]] | |
422 termini_len = len(termini_coverage[0]) | |
423 mean_nc = [1,1] | |
424 for i in range(len(termini_coverage)): | |
425 for j in range(len(termini_coverage[i])): | |
426 if j < edge or j > termini_len-edge: | |
427 continue | |
428 if whole_coverage[i][j] >= covLimit: | |
429 if float(whole_coverage[i][j]) != 0: | |
430 normalised_coverage[i][j] = float(termini_coverage[i][j]) / float(whole_coverage[i][j]) | |
431 else: | |
432 normalised_coverage[i][j] = 0 | |
433 else: | |
434 normalised_coverage[i][j] = 0 | |
435 normalised_coverage[i], mean_nc[i] = replaceNormMean(normalised_coverage[i]) | |
436 return normalised_coverage, mean_nc | |
437 | |
438 def maxPaired(paired_whole_coverage, whole_coverage): | |
439 """Max paired coverage using whole coverage, counter edge effect with paired-ends.""" | |
440 pwc = paired_whole_coverage[:] | |
441 wc = whole_coverage[:] | |
442 for i in range(len(pwc)): | |
443 for j in range(len(pwc[i])): | |
444 if pwc[i][j] < wc[i][j]: | |
445 pwc[i][j] = wc[i][j] | |
446 return pwc | |
447 | |
448 def replaceNormMean(norm_cov): | |
449 """Replace the values not normalised due to covLimit by mean.""" | |
450 nc_sum = nc_count = 0 | |
451 for nc in norm_cov: | |
452 if nc > 0: | |
453 nc_sum += nc | |
454 nc_count += 1 | |
455 if nc_count == 0: | |
456 mean_nc = 0 | |
457 else: | |
458 mean_nc = nc_sum / float(nc_count) | |
459 for i in range(len(norm_cov)): | |
460 if norm_cov[i] == 0: | |
461 norm_cov[i] = mean_nc | |
462 return norm_cov, mean_nc | |
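# Illustrative usage sketch (added for clarity, not part of the original module);
# zeros (positions skipped because of covLimit) are replaced by the mean of the
# normalised values (hypothetical values):
#   >>> replaceNormMean([0.5, 0, 0.25, 0])
#   ([0.5, 0.375, 0.25, 0.375], 0.375)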
463 | |
464 def wholeCov(whole_coverage, gen_len): | |
465 """Calculate the coverage for whole read alignments and its average""" | |
466 if gen_len == 0: | |
467 return whole_coverage, 1 | |
468 total_cov = sum(whole_coverage[0]) + sum(whole_coverage[1]) | |
469 ave_whole_cov = float(total_cov)/(2*float(gen_len)) | |
470 added_whole_coverage = [x + y for x, y in zip(whole_coverage[0], whole_coverage[1])] | |
471 return added_whole_coverage, ave_whole_cov | |
472 | |
473 def testwholeCov(added_whole_coverage, ave_whole_cov, test): | |
474 """Return information about whole coverage.""" | |
475 if test: | |
476 return "" | |
477 if ave_whole_cov < 50: | |
478 print "\nWARNING: average coverage is under the limit of the software (50)" | |
479 elif ave_whole_cov < 200: | |
480 print "\nWARNING: average coverage is low (<200), Li's method is presumably unreliable\n" | |
481 drop_cov = [] | |
482 start_pos = last_pos = count_pos = 0 | |
483 for pos in range(len(added_whole_coverage)): | |
484 if added_whole_coverage[pos] < (ave_whole_cov / 1.5): | |
485 if pos == last_pos+1: | |
486 count_pos += 1 | |
487 last_pos = pos | |
488 else: | |
489 if count_pos > 100: | |
490 drop_cov.append( (start_pos,last_pos+1) ) | |
491 last_pos = start_pos = pos | |
492 count_pos = 0 | |
493 last_pos = pos | |
494 return drop_cov | |
495 | |
496 def longest_common_substring(read, refseq): | |
497 """Longest common substring between two strings.""" | |
498 m = [[0] * (1 + len(refseq)) for i in xrange(1 + len(read))] | |
499 longest, x_longest = 0, 0 | |
500 for x in xrange(1, 1 + len(read)): | |
501 for y in xrange(1, 1 + len(refseq)): | |
502 if read[x - 1] == refseq[y - 1]: | |
503 m[x][y] = m[x - 1][y - 1] + 1 | |
504 if m[x][y] > longest: | |
505 longest = m[x][y] | |
506 x_longest = x | |
507 else: | |
508 m[x][y] = 0 | |
509 return read[x_longest - longest: x_longest] | |
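# Illustrative usage sketch (added for clarity, not part of the original module):
#   >>> longest_common_substring("GATTACA", "CTTACG")
#   'TTAC'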
510 | |
511 | |
512 ### READS Functions | |
513 def totReads(filin): | |
514 """Verify and retrieve the number of reads in the fastq file before alignment""" | |
515 if filin.endswith('.gz'): | |
516 filein = gzip.open(filin, 'rb') | |
517 else: | |
518 filein = open(filin, 'r') | |
519 | |
520 line = 0 | |
521 while filein.readline(): | |
522 line += 1 | |
523 seq = float(round(line / 4)) | |
524 filein.close() | |
525 return seq | |
526 | |
527 def usedReads(coverage, tot_reads): | |
528 """Retrieve the number of reads after alignment and calculate the percentage of reads lost.""" | |
529 used_reads = sum(coverage[0]) + sum(coverage[1]) | |
530 lost_reads = tot_reads - used_reads | |
531 lost_perc = (float(tot_reads) - float(used_reads))/float(tot_reads) * 100 | |
532 return used_reads, lost_reads, lost_perc | |
533 | |
534 | |
535 ### PEAK functions | |
536 def picMax(coverage, nbr_pic): | |
537 """COORDINATES (coverage value, position) of the nbr_pic largest coverage value.""" | |
538 if coverage == [[],[]] or coverage == []: | |
539 return "", "", "" | |
540 picMaxPlus = heapq.nlargest(nbr_pic, zip(coverage[0], itertools.count())) | |
541 picMaxMinus = heapq.nlargest(nbr_pic, zip(coverage[1], itertools.count())) | |
542 TopFreqH = max(max(np.array(zip(*picMaxPlus)[0])), max(np.array(zip(*picMaxMinus)[0]))) | |
543 return picMaxPlus, picMaxMinus, TopFreqH | |
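# Illustrative usage sketch (added for clarity, not part of the original module);
# returns (value, position) pairs for each strand plus the overall top value
# (hypothetical values):
#   >>> picMax([[0, 5, 2, 0], [1, 0, 0, 7]], 2)
#   ([(5, 1), (2, 2)], [(7, 3), (1, 0)], 7)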
544 | |
545 | |
546 ### SCORING functions | |
547 # Li's methodology | |
548 def ratioR1(TopFreqH, used_reads, gen_len): | |
549 """Calculate the ratio H/A (R1) = highest frequency/average frequency. For Li's methodology.""" | |
550 AveFreq = (float(used_reads)/float(gen_len)/2) | |
551 if AveFreq == 0: | |
552 return 0, 0 | |
553 R1 = float(TopFreqH)/float(AveFreq) | |
554 return R1, AveFreq | |
555 | |
556 def ratioR(picMax): | |
557 """Calculate the T1/T2 = Top 1st frequency/Second higher frequency. For Li's methodology.""" | |
558 T1 = picMax[0][0] | |
559 T2 = max(1,picMax[1][0]) | |
560 R = float(T1)/float(T2) | |
561 return round(R) | |
562 | |
563 def packMode(R1, R2, R3): | |
564 """Make the prognosis about the phage packaging mode and termini type. For Li's methodology.""" | |
565 packmode = "OTHER" | |
566 termini = "" | |
567 forward = "" | |
568 reverse = "" | |
569 | |
570 if R1 < 30: | |
571 termini = "Absence" | |
572 if R2 < 3: | |
573 forward = "No Obvious Termini" | |
574 if R3 < 3: | |
575 reverse = "No Obvious Termini" | |
576 elif R1 > 100: | |
577 termini = "Fixed" | |
578 if R2 < 3: | |
579 forward = "Multiple-Pref. Term." | |
580 if R3 < 3: | |
581 reverse = "Multiple-Pref. Term." | |
582 else: | |
583 termini = "Preferred" | |
584 if R2 < 3: | |
585 forward = "Multiple-Pref. Term." | |
586 if R3 < 3: | |
587 reverse = "Multiple-Pref. Term." | |
588 | |
589 if R2 >= 3: | |
590 forward = "Obvious Termini" | |
591 if R3 >= 3: | |
592 reverse = "Obvious Termini" | |
593 | |
594 if R2 >= 3 and R3 >= 3: | |
595 packmode = "COS" | |
596 if R2 >= 3 and R3 < 3: | |
597 packmode = "PAC" | |
598 if R2 < 3 and R3 >= 3: | |
599 packmode = "PAC" | |
600 return packmode, termini, forward, reverse | |
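# Illustrative usage sketch (added for clarity, not part of the original module);
# thresholds follow Li's methodology as implemented above (R1 > 100 = fixed termini,
# R2/R3 >= 3 = obvious termini on the forward/reverse strand); values are hypothetical:
#   >>> packMode(150, 10, 8)    # strong peaks on both strands
#   ('COS', 'Fixed', 'Obvious Termini', 'Obvious Termini')
#   >>> packMode(150, 10, 1)    # strong peak on one strand only
#   ('PAC', 'Fixed', 'Obvious Termini', 'Multiple-Pref. Term.')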
601 | |
602 def RemoveClosePicMax(picMax, gen_len, nbr_base): | |
603 """Remove peaks that are too close of the maximum (nbr_base around)""" | |
604 if nbr_base == 0: | |
605 return picMax[1:], [picMax[0]] | |
606 picMaxRC = picMax[:] | |
607 posMax = picMaxRC[0][1] | |
608 LimSup = posMax + nbr_base | |
609 LimInf = posMax - nbr_base | |
610 if LimSup < gen_len and LimInf >= 0: | |
611 PosOut = range(LimInf,LimSup) | |
612 elif LimSup >= gen_len: | |
613 TurnSup = LimSup - gen_len | |
614 PosOut = range(posMax,gen_len)+range(0,TurnSup) + range(LimInf,posMax) | |
615 elif LimInf < 0: | |
616 TurnInf = gen_len + LimInf | |
617 PosOut = range(0,posMax)+range(TurnInf,gen_len) + range(posMax,LimSup) | |
618 picMaxOK = [] | |
619 picOUT = [] | |
620 for peaks in picMaxRC: | |
621 if peaks[1] not in PosOut: | |
622 picMaxOK.append(peaks) | |
623 else: | |
624 picOUT.append(peaks) | |
625 return picMaxOK, picOUT | |
626 | |
627 def addClosePic(picList, picClose, norm = 0): | |
628 """Add coverage value of close peaks to the top peak. Remove picClose in picList if exist.""" | |
629 if norm: | |
630 if picClose[0][0] >= 0.5: | |
631 return picList, [picClose[0]] | |
632 picListOK = picList[:] | |
633 cov_add = 0 | |
634 for cov in picClose: | |
635 cov_add += cov[0] | |
636 picListOK[cov[1]] = 0.01 | |
637 picListOK[picClose[0][1]] = cov_add | |
638 return picListOK, picClose | |
639 | |
640 | |
641 # STATISTICS | |
642 def test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close): | |
643 """Fits a gamma distribution using a decision tree.""" | |
644 L=len(whole_coverage[0]) | |
645 res=pd.DataFrame({"Position":np.array(range(L))+1, "termini_plus": termini_coverage[0], "SPC_norm_plus":termini_coverage_norm[0], "SPC_norm_minus":termini_coverage_norm[1], "SPC_norm_plus_close":termini_coverage_norm_close[0], "SPC_norm_minus_close":termini_coverage_norm_close[1], "termini_minus": termini_coverage[1],"cov_plus": whole_coverage[0], "cov_minus": whole_coverage[1]}) | |
646 res["cov"]=res["cov_plus"].values+res["cov_minus"].values | |
647 res["R_plus"]=map(float,termini_coverage[0])/np.mean(termini_coverage[0]) | |
648 res["R_minus"]=map(float,termini_coverage[1])/np.mean(termini_coverage[1]) | |
649 | |
650 regr = DecisionTreeRegressor(max_depth=3,min_samples_leaf=100) | |
651 X = np.arange(L) | |
652 X = X[:,np.newaxis] | |
653 y = res["cov"].values | |
654 regr.fit(X,y) | |
655 | |
656 # Predict | |
657 y_1 = regr.predict(X) | |
658 res["covnode"] = y_1 | |
659 covnodes = np.unique(y_1) | |
660 thres = np.mean(whole_coverage[0])/2 | |
661 covnodes = [n for n in covnodes if n>thres] | |
662 | |
663 for node in covnodes: | |
664 X = res[res["covnode"]== node]["termini_plus"].values | |
665 res.ix[res["covnode"]==node,"pval_plus"] = gamma(X) | |
666 X = res[res["covnode"]==node]["termini_minus"].values | |
667 res.ix[res["covnode"]==node,"pval_minus"] = gamma(X) | |
668 | |
669 res.ix[res.pval_plus > 1, 'pval_plus'] = 1.00 | |
670 res.ix[res.pval_minus > 1, 'pval_minus'] = 1.00 | |
671 res = res.fillna(1.00) | |
672 | |
673 res['pval_plus_adj'] = multipletests(res["pval_plus"].values, alpha=0.01, method="bonferroni")[1] | |
674 res['pval_minus_adj'] = multipletests(res["pval_minus"].values, alpha=0.01, method="bonferroni")[1] | |
675 | |
676 res = res.fillna(1.00) | |
677 | |
678 res_plus = pd.DataFrame({"Position": res['Position'], "SPC_std": res['SPC_norm_plus']*100, "SPC": res['SPC_norm_plus_close']*100, "pval_gamma": res['pval_plus'] , "pval_gamma_adj": res['pval_plus_adj']}) | |
679 res_minus = pd.DataFrame({"Position": res['Position'], "SPC_std": res['SPC_norm_minus']*100, "SPC": res['SPC_norm_minus_close']*100, "pval_gamma": res['pval_minus'] , "pval_gamma_adj": res['pval_minus_adj']}) | |
680 | |
681 res_plus.sort_values("SPC", ascending=False, inplace=True) | |
682 res_minus.sort_values("SPC", ascending=False, inplace=True) | |
683 | |
684 res_plus.reset_index(drop=True, inplace=True) | |
685 res_minus.reset_index(drop=True, inplace=True) | |
686 | |
687 return res, res_plus, res_minus | |
688 | |
689 def gamma(X): | |
690 """Apply a gamma distribution.""" | |
691 X = np.array(X,dtype=np.int64) | |
692 v = remove_pics(X,3) | |
693 | |
694 dist_max = float(max(v)) | |
695 if dist_max == 0: | |
696 return np.array([1.00]*len(X)) | |
697 | |
698 actual = np.bincount(v) | |
699 fit_alpha, fit_loc, fit_beta = stats.gamma.fit(v) | |
700 expected=stats.gamma.pdf(np.arange(0,dist_max+1,1),fit_alpha,loc=fit_loc,scale=fit_beta)*sum(actual) | |
701 | |
702 return stats.gamma.pdf(X,fit_alpha,loc=fit_loc,scale=fit_beta) | |
703 | |
704 def remove_pics(arr,n): | |
705 '''Removes the n highest values from the array''' | |
706 arr=np.array(arr) | |
707 pic_pos=arr.argsort()[-n:][::-1] | |
708 arr2=np.delete(arr,pic_pos) | |
709 return arr2 | |
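# Illustrative usage sketch (added for clarity, not part of the original module):
#   >>> remove_pics([1, 9, 2, 8, 3], 2)   # drop the two highest values
#   array([1, 2, 3])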
710 | |
711 def selectSignificant(table, pvalue, limit): | |
712 """Return significant peaks over a limit""" | |
713 table_pvalue = table.loc[lambda df: df.pval_gamma_adj < pvalue,:] | |
714 table_pvalue_limit = table_pvalue.loc[lambda df: df.SPC > limit,:] | |
715 table_pvalue_limit.reset_index(drop=True, inplace=True) | |
716 return table_pvalue_limit | |
717 | |
718 | |
719 ### DECISION Process | |
720 def decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq): | |
721 """ .""" | |
722 P_orient = "NA" | |
723 P_seqcoh = "" | |
724 P_concat = "" | |
725 P_type = "-" | |
726 Mu_like = 0 | |
727 P_left = "Random" | |
728 P_right = "Random" | |
729 # 2 peaks sig. | |
730 if not plus_significant.empty and not minus_significant.empty: | |
731 # Multiple | |
732 if ( len(plus_significant["SPC"]) > 1 or len(minus_significant["SPC"]) > 1 ): | |
733 if not ( plus_significant["SPC"][0] > limit_fixed or minus_significant["SPC"][0] > limit_fixed ): | |
734 Redundant = 1 | |
735 P_left = "Multiple" | |
736 P_right = "Multiple" | |
737 Permuted = "Yes" | |
738 P_class = "-" | |
739 P_type = "-" | |
740 return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like | |
741 | |
742 dist_peak = abs(plus_significant['Position'][0] - minus_significant['Position'][0]) | |
743 dist_peak_over = abs( abs(plus_significant['Position'][0] - minus_significant['Position'][0]) - gen_len ) | |
744 P_left = plus_significant['Position'][0] | |
745 P_right = minus_significant['Position'][0] | |
746 # COS | |
747 if ( dist_peak <= 2 ) or ( dist_peak_over <= 2 ): | |
748 Redundant = 0 | |
749 Permuted = "No" | |
750 P_class = "COS" | |
751 P_type = "-" | |
752 elif ( dist_peak < 20 and dist_peak > 2 ) or ( dist_peak_over < 20 and dist_peak_over > 2 ): | |
753 Redundant = 0 | |
754 Permuted = "No" | |
755 P_class, P_type = typeCOS(plus_significant["Position"][0], minus_significant["Position"][0], gen_len/2 ) | |
756 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [((plus_significant["SPC"][0]),(plus_significant["Position"][0])-1)], [((minus_significant["SPC"][0]),(minus_significant["Position"][0])-1)], gen_len/2) | |
757 # DTR | |
758 elif ( dist_peak <= 1000 ) or ( dist_peak_over <= 1000 ): | |
759 Redundant = 1 | |
760 Permuted = "No" | |
761 P_class = "DTR (short)" | |
762 P_type = "T7" | |
763 P_seqcoh, packstat = sequenceCohesive("DTR", refseq, [((plus_significant["SPC"][0]),(plus_significant["Position"][0])-1)], [((minus_significant["SPC"][0]),(minus_significant["Position"][0])-1)], gen_len/2) | |
764 elif ( dist_peak <= 0.1*gen_len ) or ( dist_peak_over <= 0.1*gen_len ): | |
765 Redundant = 1 | |
766 Permuted = "No" | |
767 P_class = "DTR (long)" | |
768 P_type = "T5" | |
769 P_seqcoh, packstat = sequenceCohesive("DTR", refseq, [((plus_significant["SPC"][0]),(plus_significant["Position"][0])-1)], [((minus_significant["SPC"][0]),(minus_significant["Position"][0])-1)], gen_len/2) | |
770 else: | |
771 Redundant = 1 | |
772 Permuted = "No" | |
773 P_class = "-" | |
774 P_type = "-" | |
775 # 1 peak sig. | |
776 elif not plus_significant.empty and minus_significant.empty or plus_significant.empty and not minus_significant.empty: | |
777 Redundant = 1 | |
778 Permuted = "Yes" | |
779 P_class = "Headful (pac)" | |
780 P_type = "P1" | |
781 if paired != "": | |
782 if R1 == 0 or len(insert) == 0: | |
783 P_concat = 1 | |
784 else: | |
785 P_concat = round( (sum(insert)/len(insert)) / R1) - 1 | |
786 if not plus_significant.empty: | |
787 P_left = plus_significant['Position'][0] | |
788 P_right = "Distributed" | |
789 P_orient = "Forward" | |
790 else: | |
791 P_left = "Distributed" | |
792 P_right = minus_significant['Position'][0] | |
793 P_orient = "Reverse" | |
794 # 0 peak sig. | |
795 elif plus_significant.empty and minus_significant.empty: | |
796 Mu_like, Mu_term_plus, Mu_term_minus, P_orient = testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, phage_hybrid_coverage, Mu_threshold, hostseq) | |
797 if Mu_like: | |
798 Redundant = 0 | |
799 Permuted = "No" | |
800 P_class = "Mu-like" | |
801 P_type = "Mu" | |
802 P_left = Mu_term_plus | |
803 P_right = Mu_term_minus | |
804 else: | |
805 Redundant = 1 | |
806 Permuted = "Yes" | |
807 P_class = "-" | |
808 P_type = "-" | |
809 | |
810 return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like | |
811 | |
812 | |
813 | |
814 def testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, phage_hybrid_coverage, Mu_threshold, hostseq): | |
815 """Return Mu if enough hybrid reads compared to theory.""" | |
816 if hostseq == "": | |
817 return 0, -1, -1, "" | |
818 if paired != "": | |
819 insert_mean = sum(insert) / len(insert) | |
820 else: | |
821 insert_mean = max(100, seed+10) | |
822 Mu_limit = ((insert_mean - seed) / float(gen_len)) * used_reads/2 | |
823 test = 0 | |
824 Mu_term_plus = "Random" | |
825 Mu_term_minus = "Random" | |
826 picMaxPlus_Mu, picMaxMinus_Mu, TopFreqH_phage_hybrid = picMax(phage_hybrid_coverage, 1) | |
827 picMaxPlus_Mu = picMaxPlus_Mu[0][1] | |
828 picMaxMinus_Mu = picMaxMinus_Mu[0][1] | |
829 | |
830 # Orientation | |
831 if list_hybrid[0] > list_hybrid[1]: | |
832 P_orient = "Forward" | |
833 elif list_hybrid[1] > list_hybrid[0]: | |
834 P_orient = "Reverse" | |
835 else: | |
836 P_orient = "" | |
837 | |
838 # Termini | |
839 if list_hybrid[0] > ( Mu_limit * Mu_threshold ): | |
840 test = 1 | |
841 pos_to_check = range(picMaxPlus_Mu+1,gen_len) + range(0,100) | |
842 for pos in pos_to_check: | |
843 if phage_hybrid_coverage[0][pos] >= max(1,phage_hybrid_coverage[0][picMaxPlus_Mu]/4): | |
844 Mu_term_plus = pos | |
845 picMaxPlus_Mu = pos | |
846 else: | |
847 Mu_term_plus = pos | |
848 break | |
849 # Reverse | |
850 if list_hybrid[1] > ( Mu_limit * Mu_threshold ): | |
851 test = 1 | |
852 pos_to_check = range(0,picMaxMinus_Mu)[::-1] + range(gen_len-100,gen_len)[::-1] | |
853 for pos in pos_to_check: | |
854 if phage_hybrid_coverage[1][pos] >= max(1,phage_hybrid_coverage[1][picMaxMinus_Mu]/4): | |
855 Mu_term_minus = pos | |
856 picMaxMinus_Mu = pos | |
857 else: | |
858 Mu_term_minus = pos | |
859 break | |
860 return test, Mu_term_plus, Mu_term_minus, P_orient | |
861 | |
862 | |
863 ### PHAGE Information | |
864 def orientation(picMaxPlus, picMaxMinus): | |
865 """Return phage termini orientation.""" | |
866 if not picMaxPlus and not picMaxMinus: | |
867 return "NA" | |
868 if picMaxPlus and not picMaxMinus: | |
869 return "Forward" | |
870 if not picMaxPlus and picMaxMinus: | |
871 return "Reverse" | |
872 if picMaxPlus and picMaxMinus: | |
873 if picMaxPlus[0][0] > picMaxMinus[0][0]: | |
874 return "Forward" | |
875 elif picMaxMinus[0][0] > picMaxPlus[0][0]: | |
876 return "Reverse" | |
877 elif picMaxMinus[0][0] == picMaxPlus[0][0]: | |
878 return "NA" | |
879 | |
880 def sequenceCohesive(Packmode, refseq, picMaxPlus, picMaxMinus, nbr_lim): | |
881 """Return cohesive sequence for COS phages or repeat region for DTR phages""" | |
882 if Packmode != 'COS' and Packmode != 'DTR': | |
883 return '', Packmode | |
884 PosPlus = picMaxPlus[0][1] | |
885 PosMinus = picMaxMinus[0][1] | |
886 | |
887 if Packmode == 'COS': | |
888 SC_class, SC_type = typeCOS(PosPlus, PosMinus, nbr_lim) | |
889 | |
890 if SC_class == "COS (5')": | |
891 if abs(PosMinus - PosPlus) < nbr_lim: | |
892 seqcoh = refseq[min(PosPlus, PosMinus):max(PosPlus,PosMinus)+1] | |
893 return seqcoh, Packmode | |
894 else: | |
895 seqcoh = refseq[max(PosPlus,PosMinus)+1:] + refseq[:min(PosPlus, PosMinus)] | |
896 return seqcoh, Packmode | |
897 | |
898 elif SC_class == "COS (3')": | |
899 if abs(PosMinus - PosPlus) < nbr_lim: | |
900 seqcoh = refseq[min(PosPlus, PosMinus)+1:max(PosPlus,PosMinus)] | |
901 return seqcoh, Packmode | |
902 else: | |
903 seqcoh = refseq[max(PosPlus,PosMinus)+1:] + refseq[:min(PosPlus, PosMinus)] | |
904 return seqcoh, Packmode | |
905 else: | |
906 return '', Packmode | |
907 | |
908 elif Packmode == 'DTR': | |
909 if PosPlus < PosMinus: | |
910 seqcoh = refseq[PosPlus:PosMinus+1] | |
911 return seqcoh, Packmode | |
912 else: | |
913 seqcoh = refseq[PosPlus:] + refseq[:PosMinus+1] | |
914 return seqcoh, Packmode | |
915 | |
916 def typeCOS(PosPlus, PosMinus, nbr_lim): | |
917 """ Return type of COS sequence.""" | |
918 if PosPlus < PosMinus and abs(PosPlus-PosMinus) < nbr_lim: | |
919 return "COS (5')", "Lambda" | |
920 else: | |
921 return "COS (3')", "HK97" | |
922 | |
923 ### IMAGE Functions | |
924 def GraphCov(termini_coverage, picMaxPlus, picMaxMinus, phagename, norm, draw, hybrid = 0): | |
925 """Produces a plot with termini coverage values.""" | |
926 # Remove old plot | |
927 plt.clf() | |
928 plt.cla() | |
929 plt.close() | |
930 # Create figure | |
931 plt.figure(1) | |
932 term_len = len(termini_coverage[0]) | |
933 term_range = range(term_len) | |
934 if norm == 1: | |
935 ylim = 1.2 | |
936 elif hybrid == 1: | |
937 offset = 0.2*(max(np.array(zip(*picMaxPlus)[0]))) + 1 | |
938 ylim = max(np.array(zip(*picMaxPlus)[0])) + offset | |
939 else: | |
940 offset = 0.2*(max(max(np.array(zip(*picMaxPlus)[0])), max(np.array(zip(*picMaxMinus)[0])))) | |
941 ylim = max(max(np.array(zip(*picMaxPlus)[0])), max(np.array(zip(*picMaxMinus)[0]))) + offset | |
942 # Strand (+) | |
943 plt.subplot(211) | |
944 if norm == 1: | |
945 plt.scatter(term_range,termini_coverage[0]) | |
946 else: | |
947 plt.plot(termini_coverage[0],linewidth=2) | |
948 plt.title('strand (+)') | |
949 plt.ylabel('') | |
950 # Axes | |
951 axes = plt.gca() | |
952 axes.set_ylim([0,ylim]) | |
953 # Maximum | |
954 x_strandplus = np.array(zip(*picMaxPlus)[1]) | |
955 y_strandplus = np.array(zip(*picMaxPlus)[0]) | |
956 # Plot | |
957 plt.plot(x_strandplus, y_strandplus, 'ro') | |
958 if norm == 1: | |
959 axes.axhline(y=0.5, xmin=0, xmax=x_strandplus, color='grey', linestyle='dashed', linewidth=1.5) | |
960 axes.axhline(y=1.0, xmin=0, xmax=x_strandplus, color='grey', linestyle='dashed', linewidth=1.5) | |
961 # Annotation | |
962 for i,j in zip(x_strandplus,y_strandplus): | |
963 plt.text(i+(term_len*0.03), j, str(i+1), fontsize=15, bbox=dict(boxstyle='round', facecolor='white', alpha=1)) | |
964 # Plot Option | |
965 plt.margins(0.1) | |
966 plt.locator_params(axis = 'x', nbins = 10) | |
967 plt.locator_params(axis = 'y', nbins = 3) | |
968 plt.xticks(rotation=75) | |
969 # Strand (-) | |
970 plt.subplot(212) | |
971 if norm == 1: | |
972 plt.scatter(term_range,termini_coverage[1]) | |
973 else: | |
974 plt.plot(termini_coverage[1],linewidth=2) | |
975 plt.title('strand (-)') | |
976 plt.ylabel('') | |
977 # Axes | |
978 if hybrid == 1: | |
979 offset = 0.2*(max(np.array(zip(*picMaxMinus)[0]))) + 1 | |
980 ylim = max(np.array(zip(*picMaxMinus)[0])) + offset | |
981 axes = plt.gca() | |
982 axes.set_ylim([0,ylim]) | |
983 # Maximum | |
984 x_strandminus = np.array(zip(*picMaxMinus)[1]) | |
985 y_strandminus = np.array(zip(*picMaxMinus)[0]) | |
986 # Plot | |
987 plt.plot(x_strandminus, y_strandminus, 'ro') | |
988 if norm == 1: | |
989 axes.axhline(y=0.5, xmin=0, xmax=x_strandplus, color='grey', linestyle='dashed', linewidth=1.5) | |
990 axes.axhline(y=1.0, xmin=0, xmax=x_strandplus, color='grey', linestyle='dashed', linewidth=1.5) | |
991 # Annotation | |
992 for i,j in zip(x_strandminus,y_strandminus): | |
993 plt.text(i+(term_len*0.03), j, str(i+1), fontsize=15, bbox=dict(boxstyle='round', facecolor='white', alpha=1)) | |
994 # Plot Option | |
995 plt.margins(0.1) | |
996 plt.locator_params(axis = 'x', nbins = 10) | |
997 plt.locator_params(axis = 'y', nbins = 3) | |
998 plt.xticks(rotation=75) | |
999 # Plot Adjustments | |
1000 plt.tight_layout() | |
1001 # Draw graph | |
1002 if draw: | |
1003 plt.savefig("%s_TCov.png" % phagename, dpi=200) | |
1004 fig = plt.figure(1) | |
1005 return fig | |
1006 | |
1007 def GraphWholeCov(added_whole_coverage, phagename, draw, P_left = "", P_right = "", pos_left = 0, pos_right = 0, graphZoom = 0, title = "WHOLE COVERAGE"): | |
1008 """Produces a plot with whole coverage values.""" | |
1009 # Remove old plot | |
1010 plt.clf() | |
1011 plt.cla() | |
1012 plt.close() | |
1013 # Create figure | |
1014 offset = 0.2*(max(added_whole_coverage)) | |
1015 ylim = max(added_whole_coverage) + offset | |
1016 # Cumulative both strands | |
1017 plt.figure(figsize=(15,8)) | |
1018 plt.plot(added_whole_coverage,linewidth=2) | |
1019 plt.title(title) | |
1020 # Axes | |
1021 axes = plt.gca() | |
1022 axes.set_ylim([0,ylim]) | |
1023 # Plot Option | |
1024 plt.margins(0.1) | |
1025 plt.locator_params(axis = 'x', nbins = 10) | |
1026 plt.xticks(rotation=75) | |
1027 # Termini vertical dashed line display | |
1028 if graphZoom and isinstance(P_left, int): | |
1029 plt.axvline(x=pos_left, ymin=0, ymax=ylim, color='red', zorder=2, linestyle='dashed', linewidth=1) | |
1030 if graphZoom and isinstance(P_right, int): | |
1031 plt.axvline(x=pos_right, ymin=0, ymax=ylim, color='green', zorder=2, linestyle='dashed', linewidth=1) | |
1032 # Draw graph | |
1033 if draw: | |
1034 plt.savefig("%s_plot_WCov.png" % phagename, dpi=200) | |
1035 fig = plt.figure(1) | |
1036 return fig | |
1037 | |
1038 def GraphLogo(P_class, P_left, P_right, draw): | |
1039 """Produce logo.""" | |
1040 # Remove old plot | |
1041 plt.clf() | |
1042 plt.cla() | |
1043 plt.close() | |
1044 # Create figure | |
1045 plt.figure(figsize=(10,10)) | |
1046 #axes = plt.add_subplot(111) | |
1047 axes = plt.gca() | |
1048 axes.set_frame_on(False) | |
1049 axes.xaxis.set_visible(False) | |
1050 axes.yaxis.set_visible(False) | |
1051 # Frame | |
1052 axes.add_artist(patches.Rectangle((0.1, 0.1), 0.8, 0.8, edgecolor = 'black', fill = False, linewidth = 15)) | |
1053 | |
1054 if P_class == "Headful (pac)": | |
1055 # Text | |
1056 axes.text(0.17, 0.7, r"Headful (pac)", fontsize=50, fontweight='bold') | |
1057 # PAC (blue line) | |
1058 axes.axhline(y=0.35, xmin=0.2, xmax=0.8, color='blue', linewidth=15) | |
1059 # PAC (red line) | |
1060 axes.axvline(x=0.19, ymin=0.30, ymax=0.40, color='red', linewidth=10) | |
1061 axes.axvline(x=0.42, ymin=0.30, ymax=0.40, color='red', linewidth=10) | |
1062 axes.axvline(x=0.65, ymin=0.30, ymax=0.40, color='red', linewidth=10) | |
1063 # PAC (Arrow) | |
1064 axes.axvline(x=0.19, ymin=0.45, ymax=0.55, color='red', linewidth=15) | |
1065 axes.arrow(0.19, 0.55, 0.07, 0, color='red', linewidth=15, head_width=0.07, head_length=0.1) | |
1066 | |
1067 elif P_class == "COS (5')": | |
1068 # Text | |
1069 axes.text(0.3, 0.7, r"COS (5')", fontsize=50, fontweight='bold') | |
1070 axes.add_artist(patches.Ellipse(xy=(0.5,0.4), width=0.5, height=0.35 , edgecolor = 'blue', fill=False, lw=15)) | |
1071 axes.add_artist(patches.Ellipse(xy=(0.5,0.4), width=0.58, height=0.43 , edgecolor = 'blue', fill=False, lw=15)) | |
1072 axes.add_artist(patches.Rectangle((0.4, 0.5), 0.20, 0.20, edgecolor = 'white', facecolor = 'white', fill = True)) | |
1073 axes.axhline(y=0.56, xmin=0.415, xmax=0.48, color='red', linewidth=16) | |
1074 axes.axhline(y=0.601, xmin=0.52, xmax=0.585, color='red', linewidth=16) | |
1075 | |
1076 elif P_class == "COS (3')": | |
1077 # Text | |
1078 axes.text(0.3, 0.7, r"COS (3')", fontsize=50, fontweight='bold') | |
1079 axes.add_artist(patches.Ellipse(xy=(0.5,0.4), width=0.5, height=0.35 , edgecolor = 'blue', fill=False, lw=15)) | |
1080 axes.add_artist(patches.Ellipse(xy=(0.5,0.4), width=0.58, height=0.43 , edgecolor = 'blue', fill=False, lw=15)) | |
1081 axes.add_artist(patches.Rectangle((0.4, 0.5), 0.20, 0.20, edgecolor = 'white', facecolor = 'white', fill = True)) | |
1082 axes.axhline(y=0.601, xmin=0.415, xmax=0.48, color='red', linewidth=16) | |
1083 axes.axhline(y=0.56, xmin=0.52, xmax=0.585, color='red', linewidth=16) | |
1084 | |
1085 elif P_class == "COS": | |
1086 # Text | |
1087 axes.text(0.4, 0.7, r"COS", fontsize=50, fontweight='bold') | |
1088 axes.add_artist(patches.Ellipse(xy=(0.5,0.4), width=0.5, height=0.35 , edgecolor = 'blue', fill=False, lw=15)) | |
1089 axes.add_artist(patches.Ellipse(xy=(0.5,0.4), width=0.58, height=0.43 , edgecolor = 'blue', fill=False, lw=15)) | |
1090 axes.add_artist(patches.Rectangle((0.4, 0.5), 0.20, 0.20, edgecolor = 'white', facecolor = 'white', fill = True)) | |
1091 | |
1092 elif P_class == "DTR (short)": | |
1093 # Text | |
1094 axes.text(0.22, 0.7, r"DTR (short)", fontsize=50, fontweight='bold') | |
1095 | |
1096 verts = [(0.5, 0.5), (0.9, 0.4), (0.9, 0.3), (0.5,0.2)] | |
1097 codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4] | |
1098 path = Path(verts, codes) | |
1099 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'blue', lw=15) | |
1100 axes.add_patch(patch) | |
1101 | |
1102 verts = [(0.5, 0.2), (0.1, 0.30), (0.1, 0.45), (0.5,0.55)] | |
1103 codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4] | |
1104 path = Path(verts, codes) | |
1105 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'blue', lw=15) | |
1106 axes.add_patch(patch) | |
1107 | |
1108 verts = [(0.5, 0.55), (0.52, 0.545), (0, 0)] | |
1109 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1110 path = Path(verts, codes) | |
1111 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1112 axes.add_patch(patch) | |
1113 | |
1114 verts = [(0.56, 0.536), (0.58, 0.530), (0, 0)] | |
1115 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1116 path = Path(verts, codes) | |
1117 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1118 axes.add_patch(patch) | |
1119 | |
1120 verts = [(0.5, 0.50), (0.56, 0.480), (0, 0)] | |
1121 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1122 path = Path(verts, codes) | |
1123 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'white', lw=20) | |
1124 axes.add_patch(patch) | |
1125 | |
1126 verts = [(0.5, 0.50), (0.52, 0.495), (0, 0)] | |
1127 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1128 path = Path(verts, codes) | |
1129 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1130 axes.add_patch(patch) | |
1131 | |
1132 verts = [(0.56, 0.486), (0.58, 0.480), (0, 0)] | |
1133 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1134 path = Path(verts, codes) | |
1135 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1136 axes.add_patch(patch) | |
1137 | |
1138 elif P_class == "DTR (long)": | |
1139 # Text | |
1140 axes.text(0.25, 0.7, r"DTR (long)", fontsize=50, fontweight='bold') | |
1141 verts = [(0.5, 0.5), (0.9, 0.4), (0.9, 0.3), (0.5,0.2)] | |
1142 codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4] | |
1143 path = Path(verts, codes) | |
1144 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'blue', lw=15) | |
1145 axes.add_patch(patch) | |
1146 | |
1147 verts = [(0.5, 0.2), (0.1, 0.30), (0.1, 0.45), (0.5,0.55)] | |
1148 codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4] | |
1149 path = Path(verts, codes) | |
1150 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'blue', lw=15) | |
1151 axes.add_patch(patch) | |
1152 | |
1153 verts = [(0.5, 0.55), (0.52, 0.545), (0, 0)] | |
1154 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1155 path = Path(verts, codes) | |
1156 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1157 axes.add_patch(patch) | |
1158 | |
1159 verts = [(0.56, 0.536), (0.58, 0.530), (0, 0)] | |
1160 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1161 path = Path(verts, codes) | |
1162 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1163 axes.add_patch(patch) | |
1164 | |
1165 verts = [(0.62, 0.521), (0.64, 0.516), (0, 0)] | |
1166 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1167 path = Path(verts, codes) | |
1168 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1169 axes.add_patch(patch) | |
1170 | |
1171 verts = [(0.68, 0.507), (0.70, 0.501), (0, 0)] | |
1172 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1173 path = Path(verts, codes) | |
1174 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1175 axes.add_patch(patch) | |
1176 | |
1177 verts = [(0.5, 0.50), (0.65, 0.460), (0, 0)] | |
1178 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1179 path = Path(verts, codes) | |
1180 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'white', lw=25) | |
1181 axes.add_patch(patch) | |
1182 | |
1183 verts = [(0.5, 0.50), (0.52, 0.495), (0, 0)] | |
1184 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1185 path = Path(verts, codes) | |
1186 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1187 axes.add_patch(patch) | |
1188 | |
1189 verts = [(0.56, 0.486), (0.58, 0.480), (0, 0)] | |
1190 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1191 path = Path(verts, codes) | |
1192 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1193 axes.add_patch(patch) | |
1194 | |
1195 verts = [(0.62, 0.471), (0.64, 0.465), (0, 0)] | |
1196 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1197 path = Path(verts, codes) | |
1198 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1199 axes.add_patch(patch) | |
1200 | |
1201 verts = [(0.68, 0.456), (0.70, 0.450), (0, 0)] | |
1202 codes = [Path.MOVETO, Path.LINETO, Path.CLOSEPOLY] | |
1203 path = Path(verts, codes) | |
1204 patch = patches.PathPatch(path, facecolor='none', edgecolor = 'red', lw=15) | |
1205 axes.add_patch(patch) | |
1206 | |
1207 elif P_class == "Mu-like": | |
1208 # Text | |
1209 axes.text(0.33, 0.7, r"Mu-like", fontsize=50, fontweight='bold') | |
1210 axes.axhline(y=0.43, xmin=0.3, xmax=0.7, color='blue', linewidth=15) | |
1211 axes.axhline(y=0.47, xmin=0.3, xmax=0.7, color='blue', linewidth=15) | |
1212 axes.axhline(y=0.43, xmin=0.7, xmax=0.8, color='green', linewidth=15) | |
1213 axes.axhline(y=0.47, xmin=0.7, xmax=0.8, color='green', linewidth=15) | |
1214 axes.axhline(y=0.43, xmin=0.2, xmax=0.3, color='green', linewidth=15) | |
1215 axes.axhline(y=0.47, xmin=0.2, xmax=0.3, color='green', linewidth=15) | |
1216 | |
1217 elif P_left == "Random" and P_right == "Random": | |
1218 # Text | |
1219 axes.text(0.25, 0.7, r"UNKNOWN", fontsize=50, fontweight='bold') | |
1220 axes.text(0.44, 0.3, r"?", fontsize=200, fontweight='bold') | |
1221 else: | |
1222 # Text | |
1223 axes.text(0.4, 0.7, r"NEW", fontsize=50, fontweight='bold') | |
1224 axes.text(0.44, 0.3, r"!", fontsize=200, fontweight='bold') | |
1225 | |
1226 # Draw graph | |
1227 if draw: | |
1228 plt.savefig("%s_logo.png" % phagename, dpi=200) | |
1229 fig = plt.figure(1) | |
1230 return fig | |
1231 | |
1232 | |
1233 ### OUTPUT Result files | |
1234 def exportDataSplit(sequence, split): | |
1235 """Export sequence with split line length.""" | |
1236 seq = "" | |
1237 for i in xrange((len(sequence)/split)+1): | |
1238 seq += "".join(map(str,sequence[i*split:(i+1)*split])) + '\n' | |
1239 return seq | |
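# Illustrative usage sketch (added for clarity, not part of the original module):
#   >>> exportDataSplit("ACGTACGTAC", 4)
#   'ACGT\nACGT\nAC\n'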
1240 | |
1241 def ExportStatistics(phagename, whole_coverage, paired_whole_coverage, termini_coverage, phage_plus_norm, phage_minus_norm, paired, test_run): | |
1242 """Export peaks statistics.""" | |
1243 if test_run: | |
1244 return | |
1245 export = pd.DataFrame() | |
1246 # ORGANIZE Column | |
1247 export["Position"] = list(phage_plus_norm.sort_values("Position")["Position"]) | |
1248 if paired != "": | |
1249 export["Coverage +"] = paired_whole_coverage[0] | |
1250 else: | |
1251 export["Coverage +"] = whole_coverage[0] | |
1252 export["SPC +"] = termini_coverage[0] | |
1253 export["T +"] = [format(x/100.0,'0.2') for x in list(phage_plus_norm.sort_values("Position")["SPC_std"])] | |
1254 export["T + (close)"] = [format(x/100.0,'0.2') for x in list(phage_plus_norm.sort_values("Position")["SPC"])] | |
1255 export["pvalue +"] = [format(x,'0.2e') for x in list(phage_plus_norm.sort_values("Position")["pval_gamma"])] | |
1256 export["padj +"] = [format(x,'0.2e') for x in list(phage_plus_norm.sort_values("Position")["pval_gamma_adj"])] | |
1257 if paired != "": | |
1258 export["Coverage -"] = paired_whole_coverage[1] | |
1259 else: | |
1260 export["Coverage -"] = whole_coverage[1] | |
1261 export["SPC -"] = termini_coverage[1] | |
1262 export["T -"] = [format(x/100.0,'0.2') for x in list(phage_minus_norm.sort_values("Position")["SPC_std"])] | |
1263 export["T - (close)"] = [format(x/100.0,'0.2') for x in list(phage_minus_norm.sort_values("Position")["SPC"])] | |
1264 export["pvalue -"] = [format(x,'0.2e') for x in list(phage_minus_norm.sort_values("Position")["pval_gamma"])] | |
1265 export["padj -"] = [format(x,'0.2e') for x in list(phage_minus_norm.sort_values("Position")["pval_gamma_adj"])] | |
1266 filout = open(phagename + "_statistics.csv", "w") | |
1267 filout.write(export.to_csv(index=False)) | |
1268 filout.close() | |
1269 return | |
1270 | |
1271 def ExportCohesiveSeq(phagename, ArtcohesiveSeq, P_seqcoh, test_run): | |
1272 """Export cohesive sequence of COS phages.""" | |
1273 if test_run: | |
1274 return | |
1275 if len(ArtcohesiveSeq) < 3 and len(P_seqcoh) < 3: | |
1276 return | |
1277 if len(ArtcohesiveSeq) < 20 and len(P_seqcoh) < 20: | |
1278 export_text = "cohesive sequence" | |
1279 filout = open(phagename + "_cohesive-sequence.fasta", "w") | |
1280 else: | |
1281 export_text = "direct terminal repeats sequence" | |
1282 filout = open(phagename + "_direct-term-repeats.fasta", "w") | |
1283 if P_seqcoh != '': | |
1284 filout.write(">" + phagename + " " + export_text + " (Analysis: Statistics)\n" + exportDataSplit(P_seqcoh, 60)) | |
1285 if ArtcohesiveSeq != '': | |
1286 filout.write(">" + phagename + " " + export_text + " (Analysis: Li)\n" + exportDataSplit(ArtcohesiveSeq, 60)) | |
1287 filout.close() | |
1288 return | |
1289 | |
1290 def ExportPhageSequence(phagename, P_left, P_right, refseq, P_orient, Redundant, Mu_like, P_class, P_seqcoh, test_run): | |
1291 """Export the phage sequence reorganized and completed if needed.""" | |
1292 if test_run: | |
1293 return | |
1294 seq_out = "" | |
1295 # Mu-like | |
1296 if Mu_like: | |
1297 if P_orient == "Forward": | |
1298 if P_right != "Random": | |
1299 if P_left > P_right: | |
1300 seq_out = refseq[P_right-1:P_left-1] | |
1301 else: | |
1302 seq_out = refseq[P_right-1:] + refseq[:P_left-1] | |
1303 else: | |
1304 seq_out = refseq[P_left-1:] + refseq[:P_left-1] | |
1305 elif P_orient == "Reverse": | |
1306 if P_left != "Random": | |
1307 if P_left > P_right: | |
1308 seq_out = reverseComplement(refseq[P_right-1:P_left-1]) | |
1309 else: | |
1310 seq_out = reverseComplement(refseq[P_right-1:] + reverseComplement(refseq[:P_left-1])) | |
1311 else: | |
1312 seq_out = reverseComplement(refseq[P_right-1:] + reverseComplement(refseq[:P_right-1]) ) | |
1313 | |
1314 # COS | |
1315 elif isinstance(P_left, int) and isinstance(P_right, int): | |
1316 # Cos or DTR | |
1317 if P_class == "COS (3')": | |
1318 if abs(P_left-P_right) > len(refseq)/2: | |
1319 seq_out = refseq[min(P_left,P_right)-1:max(P_left,P_right)] | |
1320 else: | |
1321 seq_out = refseq[max(P_left,P_right)-1:] + refseq[:min(P_left,P_right)] | |
1322 seq_out = seq_out + P_seqcoh | |
1323 else: | |
1324 # Genome | |
1325 if abs(P_left-P_right) > len(refseq)/2: | |
1326 seq_out = refseq[min(P_left,P_right)-1:max(P_left,P_right)] | |
1327 else: | |
1328 seq_out = refseq[max(P_left,P_right):] + refseq[:min(P_left,P_right)-1] | |
1329 # COS 5' | |
1330 if P_class == "COS (5')": | |
1331 seq_out = P_seqcoh + seq_out | |
1332 # DTR | |
1333 else: | |
1334 seq_out = P_seqcoh + seq_out + P_seqcoh | |
1335 # PAC | |
1336 elif isinstance(P_left, int) or isinstance(P_right, int): | |
1337 if P_orient == "Reverse": | |
1338 seq_out = reverseComplement(refseq[:P_right]) + reverseComplement(refseq[P_right:]) | |
1339 else: | |
1340 seq_out = refseq[P_left-1:] + refseq[:P_left-1] | |
1341 | |
1342 if seq_out != "": | |
1343 filout = open(phagename + "_sequence.fasta", "w") | |
1344 filout.write(">" + phagename + " sequence re-organized\n" + exportDataSplit(seq_out, 60)) | |
1345 filout.close() | |
1346 return | |
1347 | |
1348 def CreateReport(phagename, seed, added_whole_coverage, draw, Redundant, P_left, P_right, Permuted, P_orient, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, tot_reads, P_seqcoh, phage_plus_norm, phage_minus_norm, ArtPackmode, termini, forward, reverse, ArtOrient, ArtcohesiveSeq, termini_coverage_close, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, lost_perc, ave_whole_cov, R1, R2, R3, host, host_len, host_whole_coverage, picMaxPlus_host, picMaxMinus_host, surrounding, drop_cov, paired, insert, phage_hybrid_coverage, host_hybrid_coverage, added_paired_whole_coverage, Mu_like, test_run, P_class, P_type, P_concat): | |
1349 """Produce a PDF report.""" | |
1350 doc = SimpleDocTemplate("%s_PhageTerm_report.pdf" % phagename, pagesize=letter, rightMargin=10,leftMargin=10, topMargin=5, bottomMargin=10) | |
1351 report=[] | |
1352 styles=getSampleStyleSheet() | |
1353 styles.add(ParagraphStyle(name='Justify', alignment=TA_JUSTIFY)) | |
1354 styles.add(ParagraphStyle(name='Center', alignment=TA_CENTER)) | |
1355 styles.add(ParagraphStyle(name='Right', alignment=TA_RIGHT)) | |
1356 styles.add(ParagraphStyle(name='Left', alignment=TA_LEFT)) | |
1357 | |
1358 ### GENERAL INFORMATION | |
1359 | |
1360 # TITLE | |
1361 ptext = '<b><font size=16>' + phagename + ' PhageTerm Analysis</font></b>' | |
1362 report.append(Paragraph(ptext, styles["Center"])) | |
1363 report.append(Spacer(1, 15)) | |
1364 | |
1365 ## ZOOMED TERMINI GRAPH AND LOGO RESULT | |
1366 | |
1367 # LOGO SELECTION | |
1368 | |
1369 imgdata = cStringIO.StringIO() | |
1370 fig_logo = GraphLogo(P_class, P_left, P_right, draw) | |
1371 fig_logo.savefig(imgdata, format='png') | |
1372 imgdata.seek(0) | |
1373 IMG = ImageReader(imgdata) | |
1374 IMAGE_2 = Image(IMG.fileName, width=150, height=150, kind='proportional') | |
1375 IMAGE_2.hAlign = 'CENTER' | |
1376 | |
1377 # Zoom on inter-termini seq | |
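# The zoomed plot spans roughly 1 kb on each side of the detected termini; the
# termini positions are shifted so that they stay correct within the sliced window.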
1378 if isinstance(P_left, int) and isinstance(P_right, int) and not Mu_like: | |
1379 Zoom_left = min(P_left-1000, P_right-1000) | |
1380 Zoom_right = max(P_left+1000, P_right+1000) | |
1381 imgdata = cStringIO.StringIO() | |
1382 if P_orient == "Reverse": | |
1383 zoom_pos_left = P_right-max(0,Zoom_left) | |
1384 zoom_pos_right = P_left-max(0,Zoom_left) | |
1385 else: | |
1386 zoom_pos_left = P_left-max(0,Zoom_left) | |
1387 zoom_pos_right = P_right-max(0,Zoom_left) | |
1388 | |
1389 figZ_whole = GraphWholeCov(added_whole_coverage[max(0,Zoom_left):min(gen_len,Zoom_right)], phagename + "-zoom", draw, P_left, P_right, zoom_pos_left, zoom_pos_right, 1, "Zoom Termini") | |
1390 figZ_whole.savefig(imgdata, format='png') | |
1391 imgdata.seek(0) | |
1392 IMG = ImageReader(imgdata) | |
1393 IMAGE = Image(IMG.fileName, width=275, height=340, kind='proportional') | |
1394 IMAGE.hAlign = 'CENTER' | |
1395 | |
1396 data = [[IMAGE, IMAGE_2]] | |
1397 t=Table(data, 1*[4*inch]+1*[3*inch], 1*[2*inch], hAlign='CENTER', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1398 report.append(t) | |
1399 report.append(Spacer(1, 5)) | |
1400 | |
1401 elif isinstance(P_left, int) and P_orient == "Forward": | |
1402 imgdata = cStringIO.StringIO() | |
1403 | |
1404 if Mu_like: | |
1405 figZL_whole = GraphWholeCov(phage_hybrid_coverage[0][max(0,P_left-1000):min(gen_len,P_left+1000)], phagename + "-zoom-left", draw, P_left, "", P_left-max(0,P_left-1000), 0, 1, "Zoom Termini") | |
1406 else: | |
1407 figZL_whole = GraphWholeCov(added_whole_coverage[max(0,P_left-1000):min(gen_len,P_left+1000)], phagename + "-zoom-left", draw, P_left, P_right, P_left-max(0,P_left-1000), 0, 1, "Zoom Termini") | |
1408 figZL_whole.savefig(imgdata, format='png') | |
1409 imgdata.seek(0) | |
1410 IMG = ImageReader(imgdata) | |
1411 IMAGE = Image(IMG.fileName, width=275, height=340, kind='proportional') | |
1412 IMAGE.hAlign = 'CENTER' | |
1413 | |
1414 data = [[IMAGE, IMAGE_2]] | |
1415 t=Table(data, 1*[5*inch]+1*[3*inch], 1*[2*inch], hAlign='CENTER', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1416 report.append(t) | |
1417 | |
1418 elif isinstance(P_right, int) and P_orient == "Reverse": | |
1419 imgdata = cStringIO.StringIO() | |
1420 | |
1421 if Mu_like: | |
1422 figZR_whole = GraphWholeCov(phage_hybrid_coverage[1][max(0,P_right-1000):min(gen_len,P_right+1000)], phagename + "-zoom-right", draw, "", P_right, 0, P_right-max(0,P_right-1000), 1, "Zoom Termini") | |
1423 else: | |
1424 figZR_whole = GraphWholeCov(added_whole_coverage[max(0,P_right-1000):min(gen_len,P_right+1000)], phagename + "-zoom-right", draw, P_left, P_right, 0, P_right-max(0,P_right-1000), 1, "Zoom Termini") | |
1425 figZR_whole.savefig(imgdata, format='png') | |
1426 imgdata.seek(0) | |
1427 IMG = ImageReader(imgdata) | |
1428 IMAGE = Image(IMG.fileName, width=275, height=340, kind='proportional') | |
1429 IMAGE.hAlign = 'CENTER' | |
1430 | |
1431 data = [[IMAGE, IMAGE_2]] | |
1432 t=Table(data, 1*[5*inch]+1*[3*inch], 1*[2*inch], hAlign='CENTER', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1433 report.append(t) | |
1434 report.append(Spacer(1, 5)) | |
1435 else: | |
1436 data = [[IMAGE_2]] | |
1437 t=Table(data, 1*[1.5*inch], 1*[2*inch], hAlign='CENTER', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1438 report.append(t) | |
1439 | |
1440 # Warning coverage message | |
1441 if ave_whole_cov < 50 and test_run == 0: | |
1442 ptextawc = "- - - - - - - - - WARNING: Coverage (" + str(int(ave_whole_cov)) + ") is below the limit of the software, please consider results carefully. - - - - - - - - -" | |
1443 data = [[ptextawc]] | |
1444 t=Table(data, 1*[5*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('TEXTCOLOR',(0,0),(-1,-1),'RED'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1445 report.append(t) | |
1446 | |
1447 ## Statistics | |
1448 ptext = '<u><font size=14>PhageTerm Method</font></u>' | |
1449 report.append(Paragraph(ptext, styles["Left"])) | |
1450 report.append(Spacer(1, 10)) | |
1451 | |
1452 if Redundant: | |
1453 Ends = "Redundant" | |
1454 else: | |
1455 Ends = "Non Red." | |
1456 | |
1457 data = [["Ends", "Left (red)", "Right (green)", "Permuted", "Orientation", "Class", "Type"], [Ends, P_left, P_right, Permuted, P_orient, P_class, P_type]] | |
1458 t=Table(data, 7*[1.10*inch], 2*[0.25*inch], hAlign='CENTER', style=[('FONT',(0,0),(-1,-2),'Helvetica-Bold'), ('GRID',(0,0),(-1,-1),0.5,colors.grey), ('FONTSIZE',(0,0),(-1,-1),12), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1459 report.append(t) | |
1460 report.append(Spacer(1, 5)) | |
1461 | |
1462 # Seq cohesive or Direct terminal repeats | |
1463 if P_seqcoh != "": | |
1464 if len(P_seqcoh) < 20: | |
1465 ptext = '<i><font size=12>*Cohesive sequence: ' + P_seqcoh + '</font></i>' | |
1466 else: | |
1467 ptext = '<i><font size=12>*Direct Terminal Repeats: ' + str(len(P_seqcoh)) + ' bp</font></i>' | |
1468 report.append(Paragraph(ptext, styles["Left"])) | |
1469 | |
1470 # Multiple / Multiple (Nextera) | |
1471 if P_left == "Multiple" and P_right == "Multiple": | |
1472 ptext = '<i><font size=12>*These results could be due to a non-randomly fragmented sequence (e.g. Nextera)</font></i>' | |
1473 report.append(Paragraph(ptext, styles["Left"])) | |
1474 | |
1475 # Concatemer | |
1476 elif P_class[:7] == "Headful" and paired != "": | |
1477 ptext = '<i><font size=12>*Concatemer estimation: ' + str(P_concat) + '</font></i>' | |
1478 report.append(Paragraph(ptext, styles["Left"])) | |
1479 | |
1480 # Mu hybrid | |
1481 elif Mu_like: | |
1482 if P_orient == "Forward": | |
1483 Mu_termini = P_left | |
1484 else: | |
1485 Mu_termini = P_right | |
1486 ptext = '<i><font size=12>*Mu estimated terminus position (from hybrid fragments): ' + str(Mu_termini) + '</font></i>' | |
1487 report.append(Paragraph(ptext, styles["Left"])) | |
1488 | |
1489 report.append(Spacer(1, 10)) | |
1490 | |
1491 # Results | |
1492 imgdata = cStringIO.StringIO() | |
1493 figP_norm = GraphCov(termini_coverage_norm_close, picMaxPlus_norm_close[:1], picMaxMinus_norm_close[:1], phagename + "-norm", 1, draw) | |
1494 figP_norm.savefig(imgdata, format='png') | |
1495 imgdata.seek(0) | |
1496 IMG = ImageReader(imgdata) | |
1497 IMAGE = Image(IMG.fileName, width=240, height=340, kind='proportional') | |
1498 IMAGE.hAlign = 'CENTER' | |
1499 | |
1500 data = [["Strand", "Location", "T", "pvalue", "T (Start. Pos. Cov. / Whole Cov.)"], ["+",phage_plus_norm["Position"][0],format(phage_plus_norm["SPC"][0]/100.0, '0.2f'),format(phage_plus_norm["pval_gamma_adj"][0], '0.2e'),IMAGE], ["",phage_plus_norm["Position"][1],format(phage_plus_norm["SPC"][1]/100.0, '0.2f'),format(phage_plus_norm["pval_gamma_adj"][1], '0.2e'),""], ["",phage_plus_norm["Position"][2],format(phage_plus_norm["SPC"][2]/100.0, '0.2f'),format(phage_plus_norm["pval_gamma_adj"][2], '0.2e'),""], ["",phage_plus_norm["Position"][3],format(phage_plus_norm["SPC"][3]/100.0, '0.2f'),format(phage_plus_norm["pval_gamma_adj"][3], '0.2e'),""], ["",phage_plus_norm["Position"][4],format(phage_plus_norm["SPC"][4]/100.0, '0.2f'),format(phage_plus_norm["pval_gamma_adj"][4], '0.2e'),""], ["-",phage_minus_norm["Position"][0],format(phage_minus_norm["SPC"][0]/100.0, '0.2f'),format(phage_minus_norm["pval_gamma_adj"][0], '0.2e'),""], ["",phage_minus_norm["Position"][1],format(phage_minus_norm["SPC"][1]/100.0, '0.2f'),format(phage_minus_norm["pval_gamma_adj"][1], '0.2e'),""], ["",phage_minus_norm["Position"][2],format(phage_minus_norm["SPC"][2]/100.0, '0.2f'),format(phage_minus_norm["pval_gamma_adj"][2], '0.2e'),""], ["",phage_minus_norm["Position"][3],format(phage_minus_norm["SPC"][3]/100.0, '0.2f'),format(phage_minus_norm["pval_gamma_adj"][3], '0.2e'),""], ["",phage_minus_norm["Position"][4],format(phage_minus_norm["SPC"][4]/100.0, '0.2f'),format(phage_minus_norm["pval_gamma_adj"][4], '0.2e'),""]] | |
1501 t=Table(data, 4*[1*inch]+1*[4*inch], 11*[0.25*inch], hAlign='CENTER', style=[('SPAN',(0,1),(0,5)), ('SPAN',(0,6),(0,10)), ('SPAN',(4,1),(4,10)), ('LINEABOVE',(0,1),(4,1),1.5,colors.black), ('LINEABOVE',(0,6),(4,6),1.5,colors.grey), ('FONT',(0,0),(-1,0),'Helvetica-Bold'), ('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),12), ('FONTSIZE',(0,1),(0,-1),16), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1502 | |
1503 report.append(t) | |
1504 report.append(Spacer(1, 5)) | |
1505 | |
1506 ## Li's Analysis | |
1507 ptext = '<u><font size=14>Li\'s Method</font></u>' | |
1508 report.append(Paragraph(ptext, styles["Left"])) | |
1509 report.append(Spacer(1, 10)) | |
1510 | |
1511 data = [["Packaging", "Termini", "Forward", "Reverse", "Orientation"], [ArtPackmode, termini, forward, reverse, ArtOrient]] | |
1512 t=Table(data, 2*[1*inch] + 2*[2*inch] + 1*[1*inch], 2*[0.25*inch], hAlign='CENTER', style=[('FONT',(0,0),(-1,-2),'Helvetica-Bold'), ('GRID',(0,0),(-1,-1),0.5,colors.grey), ('FONTSIZE',(0,0),(-1,-1),12), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1513 | |
1514 report.append(t) | |
1515 report.append(Spacer(1, 5)) | |
1516 | |
1517 # Seq cohesive or Direct terminal repeats | |
1518 if len(ArtcohesiveSeq) > 2: | |
1519 if len(ArtcohesiveSeq) < 20: | |
1520 ptext = '<i><font size=12>*Cohesive sequence: ' + ArtcohesiveSeq + '</font></i>' | |
1521 else: | |
1522 ptext = '<i><font size=12>*Direct Terminal Repeats: ' + str(len(ArtcohesiveSeq)) + ' bp</font></i>' | |
1523 report.append(Paragraph(ptext, styles["Left"])) | |
1524 report.append(Spacer(1, 10)) | |
1525 | |
1526 # Results | |
1527 imgdata = cStringIO.StringIO() | |
1528 figP = GraphCov(termini_coverage_close, picMaxPlus_close[:1], picMaxMinus_close[:1], phagename, 0, draw) | |
1529 figP.savefig(imgdata, format='png') | |
1530 imgdata.seek(0) | |
1531 IMG = ImageReader(imgdata) | |
1532 IMAGE = Image(IMG.fileName, width=240, height=340, kind='proportional') | |
1533 IMAGE.hAlign = 'CENTER' | |
1534 | |
1535 data = [["Strand", "Location", "SPC", "R", "SPC"],["+",picMaxPlus_close[0][1]+1,picMaxPlus_close[0][0],R2,IMAGE],["",picMaxPlus_close[1][1]+1,picMaxPlus_close[1][0],"-",""],["",picMaxPlus_close[2][1]+1,picMaxPlus_close[2][0],"-",""],["",picMaxPlus_close[3][1]+1,picMaxPlus_close[3][0],"-",""],["",picMaxPlus_close[4][1]+1,picMaxPlus_close[4][0],"-",""],["-",picMaxMinus_close[0][1]+1,picMaxMinus_close[0][0],R3,""], ["",picMaxMinus_close[1][1]+1,picMaxMinus_close[1][0],"-",""], ["",picMaxMinus_close[2][1]+1,picMaxMinus_close[2][0],"-",""], ["",picMaxMinus_close[3][1]+1,picMaxMinus_close[3][0],"-",""], ["",picMaxMinus_close[4][1]+1,picMaxMinus_close[4][0],"-",""]] | |
1536 t=Table(data, 4*[1*inch]+1*[4*inch], 11*[0.25*inch], hAlign='CENTER', style=[('SPAN',(0,1),(0,5)), ('SPAN',(0,6),(0,10)), ('SPAN',(4,1),(4,10)), ('LINEABOVE',(0,1),(4,1),1.5,colors.black), ('LINEABOVE',(0,6),(4,6),1.5,colors.grey), ('FONT',(0,0),(-1,0),'Helvetica-Bold'), ('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),12), ('FONTSIZE',(0,1),(0,-1),16), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1537 | |
1538 report.append(t) | |
1539 | |
1540 | |
1541 # NEW PAGE | |
1542 report.append(PageBreak()) | |
1543 | |
1544 # HOST RESULTS | |
1545 if host != "": | |
1546 # Host coverage | |
1547 ptext = '<u><font size=14>Host Analysis</font></u>' | |
1548 report.append(Paragraph(ptext, styles["Left"])) | |
1549 report.append(Spacer(1, 10)) | |
1550 | |
1551 ptext = '<i><font size=10>Reads that do not match the phage genome are tested against the host genome. These reads could come from phage transduction but also from host DNA contamination.</font></i>' | |
1552 report.append(Paragraph(ptext, styles["Justify"])) | |
1553 report.append(Spacer(1, 5)) | |
1554 | |
1555 data = [["Host Genome", str(host_len) + " bp"]] | |
1556 t=Table(data, 2*[2.25*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,0),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),12), ('ALIGN',(0,0),(-1,-1),'LEFT') ,('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1557 | |
1558 report.append(t) | |
1559 report.append(Spacer(1, 5)) | |
1560 | |
1561 imgdata = cStringIO.StringIO() | |
1562 figH = GraphCov(host_whole_coverage, picMaxPlus_host[:1], picMaxMinus_host[:1], "", 0, draw) | |
1563 figH.savefig(imgdata, format='png') | |
1564 imgdata.seek(0) | |
1565 IMG = ImageReader(imgdata) | |
1566 IMAGE = Image(IMG.fileName, width=240, height=340, kind='proportional') | |
1567 IMAGE.hAlign = 'CENTER' | |
1568 | |
1569 data = [["Strand", "Location", "Coverage", "-", "Whole Coverage"],["+",picMaxPlus_host[0][1]+1,picMaxPlus_host[0][0],"-",IMAGE],["",picMaxPlus_host[1][1]+1,picMaxPlus_host[1][0],"-",""],["",picMaxPlus_host[2][1]+1,picMaxPlus_host[2][0],"-",""],["",picMaxPlus_host[3][1]+1,picMaxPlus_host[3][0],"-",""],["",picMaxPlus_host[4][1]+1,picMaxPlus_host[4][0],"-",""],["-",picMaxMinus_host[0][1]+1,picMaxMinus_host[0][0],"-",""], ["",picMaxMinus_host[1][1]+1,picMaxMinus_host[1][0],"-",""], ["",picMaxMinus_host[2][1]+1,picMaxMinus_host[2][0],"-",""], ["",picMaxMinus_host[3][1]+1,picMaxMinus_host[3][0],"-",""], ["",picMaxMinus_host[4][1]+1,picMaxMinus_host[4][0],"-",""]] | |
1570 t=Table(data, 4*[1*inch]+1*[4*inch], 11*[0.25*inch], hAlign='CENTER', style=[('SPAN',(0,1),(0,5)), ('SPAN',(0,6),(0,10)), ('SPAN',(4,1),(4,10)), ('LINEABOVE',(0,1),(4,1),1.5,colors.black), ('LINEABOVE',(0,6),(4,6),1.5,colors.grey), ('FONT',(0,0),(-1,0),'Helvetica-Bold'), ('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),12), ('FONTSIZE',(0,1),(0,-1),16), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1571 | |
1572 report.append(t) | |
1573 report.append(Spacer(1, 10)) | |
1574 | |
1575 # Hybrid coverage | |
1576 ptext = '<u><font size=14>Hybrid Analysis</font></u>' | |
1577 report.append(Paragraph(ptext, styles["Left"])) | |
1578 report.append(Spacer(1, 10)) | |
1579 | |
1580 ptext = '<i><font size=10>Hybrid reads, with one edge on the phage genome and the other edge on the host genome, are detected. Phage hybrid coverage is used to detect the Mu-like packaging mode. Host hybrid coverage can be used to detect phage transduction, as well as the genome integration site of prophages.</font></i>' | |
1581 report.append(Paragraph(ptext, styles["Justify"])) | |
1582 report.append(Spacer(1, 5)) | |
1583 | |
1584 picMaxPlus_phage_hybrid, picMaxMinus_phage_hybrid, TopFreqH_phage_hybrid = picMax(phage_hybrid_coverage, 5) | |
1585 picMaxPlus_host_hybrid, picMaxMinus_host_hybrid, TopFreqH_host_hybrid = picMax(host_hybrid_coverage, 5) | |
1586 | |
1587 imgdataPH = cStringIO.StringIO() | |
1588 figPH = GraphCov(phage_hybrid_coverage, picMaxPlus_phage_hybrid[:1], picMaxMinus_phage_hybrid[:1], "", 0, draw, 1) | |
1589 figPH.savefig(imgdataPH, format='png') | |
1590 imgdataPH.seek(0) | |
1591 IMGPH = ImageReader(imgdataPH) | |
1592 IMAGEPH = Image(IMGPH.fileName, width=240, height=340, kind='proportional') | |
1593 IMAGEPH.hAlign = 'CENTER' | |
1594 | |
1595 | |
1596 imgdataHH = cStringIO.StringIO() | |
1597 figHH = GraphCov(host_hybrid_coverage, picMaxPlus_host_hybrid[:1], picMaxMinus_host_hybrid[:1], "", 0, draw, 1) | |
1598 figHH.savefig(imgdataHH, format='png') | |
1599 imgdataHH.seek(0) | |
1600 IMGHH = ImageReader(imgdataHH) | |
1601 IMAGEHH = Image(IMGHH.fileName, width=240, height=340, kind='proportional') | |
1602 IMAGEHH.hAlign = 'CENTER' | |
1603 | |
1604 data = [["Phage Hybrid Coverage", "Host Hybrid Coverage"],[IMAGEPH,IMAGEHH]] | |
1605 t=Table(data, 2*[4*inch], 1*[0.25*inch]+1*[2.5*inch], hAlign='CENTER', style=[('LINEABOVE',(0,1),(1,1),1.5,colors.black),('FONT',(0,0),(-1,-1),'Helvetica-Bold'),('FONTSIZE',(0,0),(-1,-1),12), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1606 | |
1607 report.append(t) | |
1608 report.append(Spacer(1, 10)) | |
1609 | |
1610 # NEW PAGE | |
1611 report.append(PageBreak()) | |
1612 | |
1613 | |
1614 # DETAILED RESULTS | |
1615 ptext = '<u><font size=14>Analysis Methodology</font></u>' | |
1616 report.append(Paragraph(ptext, styles["Left"])) | |
1617 report.append(Spacer(1, 10)) | |
1618 | |
1619 ptext = '<i><font size=10>PhageTerm uses the raw reads of a phage sequenced with a technology based on random fragmentation, together with its genomic reference sequence, to determine the termini positions. The process starts with the alignment of the NGS reads to the phage genome in order to calculate the starting position coverage (SPC), where a hit is assigned only to the position of the first base of a successfully aligned read (to speed up the process, the alignment uses the seed length (default: 20) for mapping and accepts neither gaps nor mismatches). The program then applies two distinct scoring methods: i) a statistical approach based on the gamma distribution; and ii) a method derived from Li et al. 2014.</font></i>' | |
1620 report.append(Paragraph(ptext, styles["Justify"])) | |
1621 report.append(Spacer(1, 5)) | |
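# Illustration of SPC versus whole coverage (sketch only, not used by the program):
# for a read whose alignment starts at position p and spans L bases,
#   SPC[p] += 1           # only the first mapped base is counted
#   COV[p:p+L] += 1       # every covered position is counted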
1622 | |
1623 | |
1624 # INFORMATION | |
1625 ptext = '<u><font size=12>General set-up and mapping information</font></u>' | |
1626 report.append(Paragraph(ptext, styles["Justify"])) | |
1627 report.append(Spacer(1, 5)) | |
1628 | |
1629 imgdata = cStringIO.StringIO() | |
1630 | |
1631 if paired != "": | |
1632 figP_whole = GraphWholeCov(added_paired_whole_coverage, phagename, draw) | |
1633 else: | |
1634 figP_whole = GraphWholeCov(added_whole_coverage, phagename, draw) | |
1635 figP_whole.savefig(imgdata, format='png') | |
1636 imgdata.seek(0) | |
1637 IMG = ImageReader(imgdata) | |
1638 IMAGE = Image(IMG.fileName, width=275, height=340, kind='proportional') | |
1639 IMAGE.hAlign = 'CENTER' | |
1640 | |
1641 if host == "": | |
1642 host_analysis = "No" | |
1643 else: | |
1644 host_analysis = "Yes" | |
1645 | |
1646 if paired == "": | |
1647 sequencing_reads = "Single-end Reads" | |
1648 else: | |
1649 sequencing_reads = "Paired-end Reads" | |
1650 | |
1651 data = [["Phage Genome ", str(gen_len) + " bp",IMAGE], ["Sequencing Reads", int(tot_reads),""], ["Mapping Reads", str(int(100 - lost_perc)) + " %",""], ["OPTIONS","",""], ["Mapping Seed",seed,""], ["Surrounding",surrounding,""], ["Host Analysis ", host_analysis,""], ["","",""]] | |
1652 t=Table(data, 1*[2.25*inch]+1*[1.80*inch]+1*[4*inch], 8*[0.25*inch], hAlign='LEFT', style=[('SPAN',(2,0),(2,-1)), ('FONT',(0,0),(0,2),'Helvetica-Bold'), ('FONT',(0,3),(0,3),'Helvetica-Oblique'), ('FONT',(0,4),(1,-1),'Helvetica-Oblique'), ('FONT',(2,0),(2,0),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),12), ('ALIGN',(0,0),(-1,-2),'LEFT'), ('ALIGN',(2,0),(2,-1),'CENTER') ,('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1653 | |
1654 report.append(t) | |
1655 report.append(Spacer(1, 5)) | |
1656 | |
1657 | |
1658 # Image of the highest peak on each side, even if not significant | |
1659 ptext = '<u><font size=12>Coverage graphics of the highest peak on each side</font></u>' | |
1660 report.append(Paragraph(ptext, styles["Justify"])) | |
1661 report.append(Spacer(1, 5)) | |
1662 | |
1663 imgdata = cStringIO.StringIO() | |
1664 | |
1665 if Mu_like and isinstance(P_left, int): | |
1666 figHL_whole = GraphWholeCov(phage_hybrid_coverage[0][max(0,P_left-1000):min(gen_len,P_left+1000)], phagename + "-zoom-left", draw, P_left, "", P_left-max(0,P_left-1000), 0, 1, "Zoom Termini") | |
1667 else: | |
1668 P_left = phage_plus_norm["Position"][0] | |
1669 figHL_whole = GraphWholeCov(added_whole_coverage[max(0,P_left-1000):min(gen_len,P_left+1000)], phagename + "-zoom-left", draw, P_left, "", P_left-max(0,P_left-1000), 0, 1, "Zoom Termini") | |
1670 figHL_whole.savefig(imgdata, format='png') | |
1671 imgdata.seek(0) | |
1672 IMG = ImageReader(imgdata) | |
1673 IMAGE = Image(IMG.fileName, width=275, height=340, kind='proportional') | |
1674 IMAGE.hAlign = 'CENTER' | |
1675 | |
1676 imgdata2 = cStringIO.StringIO() | |
1677 | |
1678 if Mu_like and isinstance(P_right, int): | |
1679 figHR_whole = GraphWholeCov(phage_hybrid_coverage[1][max(0,P_right-1000):min(gen_len,P_right+1000)], phagename + "-zoom-right", draw, "", P_right, 0, P_right-max(0,P_right-1000), 1, "Zoom Termini") | |
1680 else: | |
1681 P_right = phage_minus_norm["Position"][0] | |
1682 figHR_whole = GraphWholeCov(added_whole_coverage[max(0,P_right-1000):min(gen_len,P_right+1000)], phagename + "-zoom-right", draw, "", P_right, 0, P_right-max(0,P_right-1000), 1, "Zoom Termini") | |
1683 figHR_whole.savefig(imgdata2, format='png') | |
1684 imgdata2.seek(0) | |
1685 IMG2 = ImageReader(imgdata2) | |
1686 IMAGE2 = Image(IMG2.fileName, width=275, height=340, kind='proportional') | |
1687 IMAGE2.hAlign = 'CENTER' | |
1688 | |
1689 if Mu_like: | |
1690 data = [["Hybrid Coverage Zoom (Left)", "Hybrid Coverage Zoom (Right)"],[IMAGE,IMAGE2]] | |
1691 else: | |
1692 data = [["Whole Coverage Zoom (Left)", "Whole Coverage Zoom (Right)"],[IMAGE,IMAGE2]] | |
1693 t=Table(data, 2*[4*inch], 1*[0.25*inch]+1*[2*inch], hAlign='CENTER', style=[('LINEABOVE',(0,1),(1,1),1.5,colors.black),('FONT',(0,0),(-1,-1),'Helvetica-Bold'),('FONTSIZE',(0,0),(-1,-1),12), ('ALIGN',(0,0),(-1,-1),'CENTER'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1694 report.append(t) | |
1695 | |
1696 # Controls | |
1697 ptext = '<u><font size=12>General control information</font></u>' | |
1698 report.append(Paragraph(ptext, styles["Justify"])) | |
1699 report.append(Spacer(1, 5)) | |
1700 | |
1701 if ave_whole_cov < 50: | |
1702 ptextawc = "WARNING: Below the limit of the software (50)" | |
1703 elif ave_whole_cov < 200: | |
1704 ptextawc = "WARNING: Low (<200), Li's method may not be reliable" | |
1705 else: | |
1706 ptextawc = "OK" | |
1707 | |
1708 data = [["Whole genome coverage", int(ave_whole_cov), ptextawc]] | |
1709 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[3.5*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1710 report.append(t) | |
1711 | |
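# Control: percentage of genome positions whose coverage falls below half of the
# average whole-genome coverage; a high value suggests a problem with the reference.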
1712 drop_perc = 100.0 * len([i for i in added_whole_coverage if i < (ave_whole_cov/2)]) / len(added_whole_coverage) | |
1713 if drop_perc < 1: | |
1714 ptextdp = "OK" | |
1715 else: | |
1716 ptextdp = "Check your genome reference" | |
1717 | |
1718 data = [["Weak genome coverage", "%.1f %%" %drop_perc, ptextdp]] | |
1719 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[4*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1720 report.append(t) | |
1721 | |
1722 if paired != "": | |
1723 insert_mean = sum(insert)/len(insert) | |
1724 data = [["Insert mean size", insert_mean, "Mean insert estimated from paired-end reads"]] | |
1725 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[4*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1726 report.append(t) | |
1727 | |
1728 if lost_perc > 25: | |
1729 ptextlp = "WARNING: High percentage of reads lost" | |
1730 else: | |
1731 ptextlp = "OK" | |
1732 | |
1733 data = [["Reads lost during alignment", "%.1f %%" %lost_perc, ptextlp]] | |
1734 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[4*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1735 report.append(t) | |
1736 report.append(Spacer(1, 5)) | |
1737 | |
1738 # DETAILED SCORE | |
1739 ptext = '<b><font size=14>i) PhageTerm method</font></b>' | |
1740 report.append(Paragraph(ptext, styles["Left"])) | |
1741 report.append(Spacer(1, 10)) | |
1742 | |
1743 ptext = '<i><font size=10>Reads are mapped on the reference to determine the starting position coverage (SPC) as well as the coverage (COV) in each orientation. These values are then used to compute the variable T = SPC / COV. The average value of T at positions along the genome that are not termini is expected to be 1/F, where F is the average fragment size. At the termini, the expected value depends on the packaging mode. Cos phages: no reads should start before the terminus, therefore T = 1. DTR phages: for N phages present in the sample, there should be N fragments that start at the terminus and N fragments that cover the edge of the repeat on the other side of the genome, so T is expected to be 0.5. Pac phages: for N phages in the sample, there should be N/C fragments starting at the pac site, where C is the number of phage genome copies per concatemer. In the same sample, N fragments should cover the pac site position, so T is expected to be (N/C)/(N+N/C) = 1/(1+C). To assess whether the number of reads starting at a given position along the genome can be considered a significant outlier, PhageTerm first segments the genome according to coverage using a regression tree. A gamma distribution is then fitted to the SPC of each segment and an adjusted p-value is computed for each position. Finally, if several significant peaks are detected within a small sequence window (default: 20 bp), their T values are merged.</font></i>' | |
1744 report.append(Paragraph(ptext, styles["Justify"])) | |
1745 report.append(Spacer(1, 5)) | |
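# Worked example of the expected T = SPC / COV values described above (sketch,
# assuming N phage genomes in the sample and C genome copies per concatemer):
#   background (non-terminus) position:  T ~ 1/F  (F = average fragment size)
#   COS terminus:   T ~ 1                (every read covering it also starts there)
#   DTR terminus:   T ~ N / (N + N) = 0.5
#   PAC site:       T ~ (N/C) / (N + N/C) = 1 / (1 + C)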
1746 | |
1747 # surrounding | |
1748 if surrounding > 0: | |
1749 data = [["Nearby Termini (Forward / Reverse)", str(len(picOUT_norm_forw)-1) + " / " + str(len(picOUT_norm_rev)-1), "Peaks located within %s bases of the maximum" %surrounding]] | |
1750 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[4*inch], 1*[0.25*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1751 report.append(t) | |
1752 report.append(Spacer(1, 5)) | |
1753 | |
1754 report.append(Spacer(1, 20)) | |
1755 | |
1756 # Li's Method | |
1757 ptext = '<b><font size=14>ii) Li\'s method</font></b>' | |
1758 report.append(Paragraph(ptext, styles["Left"])) | |
1759 report.append(Spacer(1, 10)) | |
1760 | |
1761 ptext = '<i><font size=10>The second approach is based on the calculation and interpretation of three specific ratios, R1, R2 and R3, as suggested in a previous publication by Li et al. 2014. The first ratio is calculated as follows: the highest starting frequency found on either the forward or reverse strand is divided by the average starting frequency, R1 = (highest frequency/average frequency). Li et al. proposed three possible interpretations of the R1 ratio. First, if R1 < 30, the phage genome does not have any termini, and is either circular or completely permuted and terminally redundant. The second interpretation is when 30 ≤ R1 ≤ 100, suggesting the presence of preferred termini with terminal redundancy and the appearance of partial circular permutations. Finally, if R1 > 100, this indicates that at least one fixed terminus is present, with the terminase recognizing a specific site. The two other ratios, R2 and R3, are calculated in a similar manner. R2 is calculated using the two highest frequencies (T1-F and T2-F) found on the forward strand and R3 is calculated using the two highest frequencies (T1-R and T2-R) found on the reverse strand. To calculate these two ratios, the highest frequency is divided by the second highest frequency, so R2 = (T1-F / T2-F) and R3 = (T1-R / T2-R). These two ratios are used to analyze the termini characteristics of each strand taken individually. Li et al. suggested two possible interpretations for the R2 and R3 ratios combined with R1. When R2 < 3, there is either no obvious terminus on the forward strand (R1 < 30), or there are multiple preferred termini on the forward strand (30 ≤ R1 ≤ 100). If R2 > 3, an obvious unique terminus is present on the forward strand. The same reasoning applies to R3. Combining the ratios found with this approach, it is possible to make a first prediction of the viral packaging mode of the analyzed phage. A unique obvious terminus present at both ends (both R2 and R3 > 3) reveals a COS mode of packaging. The headful (PAC) mode of packaging is concluded when there is a single obvious terminus on only one strand. A whole-genome coverage of around 500X is needed for this method to be reliable.</font></i>' | |
1762 report.append(Paragraph(ptext, styles["Justify"])) | |
1763 report.append(Spacer(1, 5)) | |
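# Minimal sketch of the three ratios described above (illustrative only; the
# actual values are computed earlier in the pipeline):
#   R1 = highest_SPC / average_SPC    # over both strands
#   R2 = T1_F / T2_F                  # two highest SPC values on the forward strand
#   R3 = T1_R / T2_R                  # two highest SPC values on the reverse strand
# Interpretation: R1 < 30 -> no termini; 30 <= R1 <= 100 -> preferred termini with
# terminal redundancy; R1 > 100 -> at least one fixed terminus. R2 (resp. R3) > 3
# -> unique terminus on the forward (resp. reverse) strand.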
1764 | |
1765 if surrounding > 0: | |
1766 data = [["Nearby Termini (Forward / Reverse)", str(len(picOUT_forw)-1) + " / " + str(len(picOUT_rev)-1), "Peaks located within %s bases of the maximum" %surrounding]] | |
1767 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[3.5*inch], 1*[0.25*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1768 report.append(t) | |
1769 report.append(Spacer(1, 5)) | |
1770 | |
1771 if R1 > 100: | |
1772 ptextR1 = "At least one fixed terminus is present, with the terminase recognizing a specific site." | |
1773 elif R1 > 30: | |
1774 ptextR1 = "Presence of preferred termini with terminal redundancy and the appearance of partial circular permutations." | |
1775 else: | |
1776 ptextR1 = "Phage genome does not have any termini, and is either circular or completely permuted and terminally redundant." | |
1777 | |
1778 data = [["R1 - highest freq./average freq.", int(R1), Paragraph(ptextR1, styles["Justify"])]] | |
1779 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[3.5*inch], 1*[0.25*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1780 report.append(t) | |
1781 report.append(Spacer(1, 5)) | |
1782 | |
1783 if R2 < 3 and R1 < 30: | |
1784 ptextR2 = "No obvious termini on the forward strand." | |
1785 elif R2 < 3 : | |
1786 ptextR2 = "Multiple preferred termini on the forward strand." | |
1787 elif R2 >= 3: | |
1788 ptextR2 = "Unique terminus on the forward strand." | |
1789 | |
1790 data = [["R2 Forw - highest freq./second freq.", int(R2), Paragraph(ptextR2, styles["Justify"])]] | |
1791 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[3.5*inch], 1*[0.25*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1792 report.append(t) | |
1793 report.append(Spacer(1, 5)) | |
1794 | |
1795 if R3 < 3 and R1 < 30: | |
1796 ptextR3 = "No obvious termini on the reverse strand." | |
1797 elif R3 < 3 : | |
1798 ptextR3 = "Multiple preferred termini on the reverse strand." | |
1799 elif R3 >= 3: | |
1800 ptextR3 = "Unique terminus on the reverse strand." | |
1801 | |
1802 data = [["R3 Rev - highest freq./second freq.", int(R3), Paragraph(ptextR3, styles["Justify"])]] | |
1803 t=Table(data, 1*[3.5*inch]+1*[1*inch]+1*[3.5*inch], 1*[0.25*inch], hAlign='LEFT', style=[('FONT',(0,0),(0,-1),'Helvetica-Bold'), ('FONTSIZE',(0,0),(-1,-1),10), ('ALIGN',(0,0),(-1,-1),'LEFT'),('VALIGN',(0,0),(-1,-1),'MIDDLE')]) | |
1804 report.append(t) | |
1805 | |
1806 # CREDITS and TIME | |
1807 ptext = '<font size=8>%s</font>' % "Please cite: Sci. Rep. DOI 10.1038/s41598-017-07910-5" | |
1808 report.append(Paragraph(ptext, styles["Center"])) | |
1809 ptext = '<font size=8>%s</font>' % "Garneau, Depardieu, Fortier, Bikard and Monot. PhageTerm: Determining Bacteriophage Termini and Packaging using NGS data." | |
1810 report.append(Paragraph(ptext, styles["Center"])) | |
1811 ptext = '<font size=8>Report generated : %s</font>' % time.ctime() | |
1812 report.append(Paragraph(ptext, styles["Center"])) | |
1813 | |
1814 # CREATE PDF | |
1815 doc.build(report) | |
1816 return | |
1817 | |
1818 | |
1819 | |
1820 | |
1821 | |
1822 | |
1823 | |
1824 | |
1825 | |
1826 |