annotate _modules/utilities.py @ 0:69e8f12c8b31 draft

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 15:06:20 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
1 ## @file utilities.py
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
2 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
3 # Gather here utility methods for phageterm. Used in both CPU and GPU version.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
4 #from string import maketrans
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
5 import re
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
6 import random
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
7 import sys
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
8
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
9 import numpy as np
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
10 import datetime
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
11
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
# Nucleotide complement table (A<->T, C<->G, N->N), built once at import time.
if sys.version_info[0] >= 3:
    TRANSTAB = str.maketrans("ACGTN", "TGCAN")
else:
    # Python 2 only exposes maketrans through the string module.
    import string
    TRANSTAB = string.maketrans("ACGTN", "TGCAN")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
17
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def checkReportTitle(report_title):
    """Normalise a report title by removing every special character.

    Only ASCII letters, digits, '-' and '_' are kept, and the cleaned title
    is truncated to 20 characters.

    @param report_title  title supplied by the user (any string).
    @return the cleaned title, or a timestamp-based default of the form
            "Analysis_<month><day>_<hour><minute>" when fewer than two
            valid characters remain.
    """
    allowed = frozenset(
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "-_"
    )
    titleNorm = "".join(char for char in report_title if char in allowed)
    # NOTE(review): a title with a single valid character is rejected too;
    # kept as in the original code -- confirm this is intended.
    if len(titleNorm) > 1:
        return titleNorm[:20]

    # Fallback: the original returned the undefined name `default` here,
    # which raised NameError instead of yielding the default title.
    right_now = datetime.datetime.now()
    return "Analysis_{}{}_{}{}".format(
        right_now.month, right_now.day, right_now.hour, right_now.minute
    )
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
36
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
37 ### SEQUENCE manipulation function
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def changeCase(seq):
    """Return the sequence string converted to UPPER CASE."""
    upper_seq = seq.upper()
    return upper_seq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
41
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
42
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def reverseComplement(seq, transtab=None):
    """Return the reverse complement of a nucleotide sequence.

    @param seq       sequence string; it is upper-cased before translation.
    @param transtab  optional translation table passed to str.translate.
                     When omitted, the module-level TRANSTAB
                     (A<->T, C<->G, N->N) is used.
    @return the complemented sequence, reversed.
    """
    if transtab is None:
        # Resolve the default at call time: the former default argument
        # `str.maketrans(...)` was evaluated at definition time and does not
        # exist on Python 2, defeating the version guard that builds TRANSTAB.
        transtab = TRANSTAB
    # seq.upper() is what changeCase() does; bases are normalised first so
    # that lower-case input is complemented as well.
    return seq.upper().translate(transtab)[::-1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
46
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def longest_common_substring(read, refseq):
    """Return the longest common substring between two strings.

    Dynamic programming over suffix-match lengths. Only the previous row of
    the table is needed to compute the current one, so two rows are kept
    instead of the full (len(read)+1) x (len(refseq)+1) matrix.

    @param read    first string; the result is a slice of it.
    @param refseq  second string.
    @return the first longest substring of `read` (scanning `read` left to
            right) that also occurs in `refseq`; "" when nothing is shared.
    """
    row_width = len(refseq) + 1
    best_len = 0
    best_end = 0  # index in `read` just past the best match
    previous = [0] * row_width
    for x, read_char in enumerate(read, 1):
        current = [0] * row_width
        for y, ref_char in enumerate(refseq, 1):
            if read_char == ref_char:
                run = previous[y - 1] + 1
                current[y] = run
                # Strict comparison keeps the earliest match on ties.
                if run > best_len:
                    best_len = run
                    best_end = x
        previous = current
    return read[best_end - best_len:best_end]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
61
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def hybridCoverage(read, sequence, hybrid_coverage, start, end):
    """Increment the hybrid coverage for the aligned part of a read.

    The longest common substring between `read` and `sequence[start:end]` is
    computed, and `hybrid_coverage` is incremented by one at each position
    from `start` over the length of that substring, never going past the end
    of `sequence`.

    @return the same `hybrid_coverage` object, updated in place.
    """
    overlap = longest_common_substring(read, sequence[start:end])
    stop = min(len(sequence), start + len(overlap))
    for position in range(start, stop):
        hybrid_coverage[position] += 1
    return hybrid_coverage
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
68
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
69 ## Determines if readPart maps against Sequence.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
70 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
71 # @param readPart A part of a read (seed characters usually)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
72 # @param sequence (a contig)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
73 # It randomly chooses a mapping position amongst all mappings found.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
74 # It returns 2 numbers: the start and stop position of the chosen mapping location.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def applyCoverage(readPart, sequence):
    """Pick at random one location where readPart matches the sequence.

    `readPart` is handed to re.finditer as the pattern. Returns the
    (start, end) positions of the chosen match, or (-1, -1) when there is
    no match at all.
    """
    matches = list(re.finditer(readPart, sequence))
    if not matches:
        return -1, -1
    chosen = random.choice(matches)
    return chosen.start(), chosen.end()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
85
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
def correctEdge(coverage, edge):
    """Return a copy of the coverage with the edge regions corrected.

    For each row of `coverage`, the values are copied into a new two-row
    numpy array. The last `edge` values of the row are then added to the
    positions edge..2*edge-1, and the first `edge` values are added to the
    positions just before the last `edge` ones.
    """
    width = len(coverage[0])
    corrected = np.array([[0] * width, [0] * width])
    tail_start = width - edge
    for row_index, row in enumerate(coverage):
        # Plain copy of the row.
        for col in range(len(row)):
            corrected[row_index][col] = row[col]
        # Add the trailing edge just after the leading edge.
        for offset in range(edge):
            corrected[row_index][offset + edge] += row[offset + tail_start]
        # Add the leading edge just before the trailing edge.
        for col in range(tail_start, width):
            corrected[row_index][col - edge] += row[col - tail_start]
    return corrected
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
100
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
101 # utility class for storing results of decisionProcess function
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
class DecisionProcessOutput:
    """Container holding the values produced by the decisionProcess function.

    Every constructor argument is stored as an attribute of the same name.
    The original constructor body was `pass`, so all values were discarded.
    """

    def __init__(self, Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat,
                 P_orient, P_left, P_right, Mu_like):
        self.Redundant = Redundant
        self.Permuted = Permuted
        self.P_class = P_class
        self.P_type = P_type
        self.P_seqcoh = P_seqcoh
        self.P_concat = P_concat
        self.P_orient = P_orient
        self.P_left = P_left
        self.P_right = P_right
        self.Mu_like = Mu_like
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
106