Mercurial > repos > bioit_sciensano > phagetermvirome
comparison _modules/common_readsCoverage_processing.py @ 0:69e8f12c8b31 draft
"planemo upload"
| author | bioit_sciensano |
|---|---|
| date | Fri, 11 Mar 2022 15:06:20 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:69e8f12c8b31 |
|---|---|
| 1 ## @file common_readsCoverage_processing.py | |
| 2 # | |
| 3 # VL: here I gathered functions that are common to both GPU and mono/multi CPU versions. | |
| 4 # These functions are called after the mapping is done and all the counters are filled from mapping output results. | |
| 5 from __future__ import print_function | |
| 6 | |
| 7 from time import gmtime, strftime | |
| 8 import heapq | |
| 9 import itertools | |
| 10 | |
| 11 import numpy as np | |
| 12 import pandas as pd | |
| 13 # Statistics | |
| 14 from scipy import stats | |
| 15 from statsmodels.sandbox.stats.multicomp import multipletests | |
| 16 from sklearn.tree import DecisionTreeRegressor #TODO VL: fix issue on importing that | |
| 17 | |
| 18 from _modules.utilities import checkReportTitle | |
| 19 from _modules.SeqStats import SeqStats | |
| 20 | |
| 21 import os | |
| 22 | |
| 23 | |
| 24 k_no_match_for_contig=1 | |
| 25 | |
| 26 def wholeCov(whole_coverage, gen_len): | |
| 27 """Calculate the coverage for whole read alignments and its average""" | |
| 28 if gen_len == 0: | |
| 29 return whole_coverage, 1 | |
| 30 total_cov = sum(whole_coverage[0]) + sum(whole_coverage[1]) | |
| 31 ave_whole_cov = float(total_cov) / (2 * float(gen_len)) | |
| 32 added_whole_coverage = [x + y for x, y in zip(whole_coverage[0], whole_coverage[1])] | |
| 33 return added_whole_coverage, ave_whole_cov | |
| 34 | |
| 35 def testwholeCov(added_whole_coverage, ave_whole_cov, test): | |
| 36 """Return information about whole coverage.""" | |
| 37 if test: | |
| 38 return "" | |
| 39 if ave_whole_cov < 50: | |
| 40 print("\nWARNING: average coverage is under the limit of the software (50)") | |
| 41 elif ave_whole_cov < 200: | |
| 42 print("\nWARNING: average coverage is low (<200), Li's method is presumably unreliable\n") | |
| 43 drop_cov = [] | |
| 44 start_pos = last_pos = count_pos = 0 | |
| 45 for pos in range(len(added_whole_coverage)): | |
| 46 if added_whole_coverage[pos] < (ave_whole_cov / 1.5): | |
| 47 if pos == last_pos+1: | |
| 48 count_pos += 1 | |
| 49 last_pos = pos | |
| 50 else: | |
| 51 if count_pos > 100: | |
| 52 drop_cov.append( (start_pos,last_pos+1) ) | |
| 53 last_pos = start_pos = pos | |
| 54 count_pos = 0 | |
| 55 last_pos = pos | |
| 56 return drop_cov | |
| 57 | |
| 58 def maxPaired(paired_whole_coverage, whole_coverage): | |
| 59 """Max paired coverage using whole coverage, counter edge effect with paired-ends.""" | |
| 60 pwc = paired_whole_coverage[:] | |
| 61 wc = whole_coverage[:] | |
| 62 for i in range(len(pwc)): | |
| 63 for j in range(len(pwc[i])): | |
| 64 if pwc[i][j] < wc[i][j]: | |
| 65 pwc[i][j] = wc[i][j] | |
| 66 return pwc | |
| 67 | |
| 68 def replaceNormMean(norm_cov): | |
| 69 """Replace the values not normalised due to covLimit by mean.""" | |
| 70 nc_sum = nc_count = 0 | |
| 71 for nc in norm_cov: | |
| 72 if nc > 0: | |
| 73 nc_sum += nc | |
| 74 nc_count += 1 | |
| 75 if nc_count == 0: | |
| 76 mean_nc = 0 | |
| 77 else: | |
| 78 mean_nc = nc_sum / float(nc_count) | |
| 79 for i in range(len(norm_cov)): | |
| 80 if norm_cov[i] == 0: | |
| 81 norm_cov[i] = mean_nc | |
| 82 return norm_cov, mean_nc | |
| 83 | |
| 84 def normCov(termini_coverage, whole_coverage, covLimit, edge): | |
| 85 """Return the termini_coverage normalised by the whole coverage (% of coverage due to first base).""" | |
| 86 normalised_coverage = [len(termini_coverage[0])*[0], len(termini_coverage[0])*[0]] | |
| 87 termini_len = len(termini_coverage[0]) | |
| 88 mean_nc = [1,1] | |
| 89 for i in range(len(termini_coverage)): | |
| 90 for j in range(len(termini_coverage[i])): | |
| 91 if j < edge or j > termini_len-edge: | |
| 92 continue | |
| 93 if whole_coverage[i][j] >= covLimit: | |
| 94 if float(whole_coverage[i][j]) != 0: | |
| 95 normalised_coverage[i][j] = float(termini_coverage[i][j]) / float(whole_coverage[i][j]) | |
| 96 else: | |
| 97 normalised_coverage[i][j] = 0 | |
| 98 else: | |
| 99 normalised_coverage[i][j] = 0 | |
| 100 normalised_coverage[i], mean_nc[i] = replaceNormMean(normalised_coverage[i]) | |
| 101 return normalised_coverage, mean_nc | |
| 102 | |
| 103 def RemoveEdge(tableau, edge): | |
| 104 return tableau[edge:-edge] | |
| 105 | |
| 106 def usedReads(coverage, tot_reads): | |
| 107 """Retrieve the number of reads after alignment and calculate the percentage of reads lost.""" | |
| 108 used_reads = sum(coverage[0]) + sum(coverage[1]) | |
| 109 lost_reads = tot_reads - used_reads | |
| 110 lost_perc = (float(tot_reads) - float(used_reads))/float(tot_reads) * 100 | |
| 111 return used_reads, lost_reads, lost_perc | |
| 112 | |
| 113 ### PEAK functions | |
| 114 def picMax(coverage, nbr_pic): | |
| 115 """COORDINATES (coverage value, position) of the nbr_pic largest coverage value.""" | |
| 116 if coverage == [[],[]] or coverage == []: | |
| 117 return "", "", "" | |
| 118 picMaxPlus = heapq.nlargest(nbr_pic, zip(coverage[0], itertools.count())) | |
| 119 picMaxMinus = heapq.nlargest(nbr_pic, zip(coverage[1], itertools.count())) | |
| 120 TopFreqH = max(max(np.array(list(zip(*picMaxPlus))[0])), max(np.array(list(zip(*picMaxMinus))[0]))) | |
| 121 return picMaxPlus, picMaxMinus, TopFreqH | |
| 122 | |
| 123 def RemoveClosePicMax(picMax, gen_len, nbr_base): | |
| 124 """Remove peaks that are too close of the maximum (nbr_base around)""" | |
| 125 if nbr_base == 0: | |
| 126 return picMax[1:], [picMax[0]] | |
| 127 picMaxRC = picMax[:] | |
| 128 posMax = picMaxRC[0][1] | |
| 129 LimSup = posMax + nbr_base | |
| 130 LimInf = posMax - nbr_base | |
| 131 if LimSup < gen_len and LimInf >= 0: | |
| 132 PosOut = list(range(LimInf,LimSup)) | |
| 133 elif LimSup >= gen_len: | |
| 134 TurnSup = LimSup - gen_len | |
| 135 PosOut = list(range(posMax,gen_len))+list(range(0,TurnSup)) + list(range(LimInf,posMax)) | |
| 136 elif LimInf < 0: | |
| 137 TurnInf = gen_len + LimInf | |
| 138 PosOut = list(range(0,posMax))+list(range(TurnInf,gen_len)) + list(range(posMax,LimSup)) | |
| 139 picMaxOK = [] | |
| 140 picOUT = [] | |
| 141 for peaks in picMaxRC: | |
| 142 if peaks[1] not in PosOut: | |
| 143 picMaxOK.append(peaks) | |
| 144 else: | |
| 145 picOUT.append(peaks) | |
| 146 return picMaxOK, picOUT | |
| 147 | |
| 148 def addClosePic(picList, picClose, norm = 0): | |
| 149 """Add coverage value of close peaks to the top peak. Remove picClose in picList if exist.""" | |
| 150 if norm: | |
| 151 if picClose[0][0] >= 0.5: | |
| 152 return picList, [picClose[0]] | |
| 153 picListOK = picList[:] | |
| 154 cov_add = 0 | |
| 155 for cov in picClose: | |
| 156 cov_add += cov[0] | |
| 157 picListOK[cov[1]] = 0.01 | |
| 158 picListOK[picClose[0][1]] = cov_add | |
| 159 return picListOK, picClose | |
| 160 | |
| 161 def remove_pics(arr,n): | |
| 162 '''Removes the n highest values from the array''' | |
| 163 arr=np.array(arr) | |
| 164 pic_pos=arr.argsort()[-n:][::-1] | |
| 165 arr2=np.delete(arr,pic_pos) | |
| 166 return arr2 | |
| 167 | |
| 168 def gamma(X): | |
| 169 """Apply a gamma distribution.""" | |
| 170 X = np.array(X, dtype=np.int64) | |
| 171 v = remove_pics(X, 3) | |
| 172 | |
| 173 dist_max = float(max(v)) | |
| 174 if dist_max == 0: | |
| 175 return np.array([1.00] * len(X)) | |
| 176 | |
| 177 actual = np.bincount(v) | |
| 178 fit_alpha, fit_loc, fit_beta = stats.gamma.fit(v) | |
| 179 expected = stats.gamma.pdf(np.arange(0, dist_max + 1, 1), fit_alpha, loc=fit_loc, scale=fit_beta) * sum(actual) | |
| 180 | |
| 181 return stats.gamma.pdf(X, fit_alpha, loc=fit_loc, scale=fit_beta) | |
| 182 | |
| 183 | |
| 184 # STATISTICS | |
| 185 def test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close): | |
| 186 """Fits a gamma distribution using a decision tree.""" | |
| 187 L = len(whole_coverage[0]) | |
| 188 res = pd.DataFrame({"Position": np.array(range(L)) + 1, "termini_plus": termini_coverage[0], | |
| 189 "SPC_norm_plus": termini_coverage_norm[0], "SPC_norm_minus": termini_coverage_norm[1], | |
| 190 "SPC_norm_plus_close": termini_coverage_norm_close[0], | |
| 191 "SPC_norm_minus_close": termini_coverage_norm_close[1], "termini_minus": termini_coverage[1], | |
| 192 "cov_plus": whole_coverage[0], "cov_minus": whole_coverage[1]}) | |
| 193 | |
| 194 res["cov"] = res["cov_plus"].values + res["cov_minus"].values | |
| 195 | |
| 196 res["R_plus"] = list(map(float, termini_coverage[0])) // np.mean(termini_coverage[0]) | |
| 197 res["R_minus"] = list(map(float, termini_coverage[1])) // np.mean(termini_coverage[1]) | |
| 198 | |
| 199 regr = DecisionTreeRegressor(max_depth=3, min_samples_leaf=100) | |
| 200 X = np.arange(L) | |
| 201 X = X[:, np.newaxis] | |
| 202 y = res["cov"].values | |
| 203 regr.fit(X, y) | |
| 204 | |
| 205 # Predict | |
| 206 y_1 = regr.predict(X) | |
| 207 res["covnode"] = y_1 | |
| 208 covnodes = np.unique(y_1) | |
| 209 thres = np.mean(whole_coverage[0]) / 2 | |
| 210 covnodes = [n for n in covnodes if n > thres] | |
| 211 | |
| 212 for node in covnodes: | |
| 213 X = res[res["covnode"] == node]["termini_plus"].values | |
| 214 res.loc[res["covnode"] == node, "pval_plus"] = gamma(X) | |
| 215 X = res[res["covnode"] == node]["termini_minus"].values | |
| 216 res.loc[res["covnode"] == node, "pval_minus"] = gamma(X) | |
| 217 | |
| 218 res.loc[res.pval_plus > 1, 'pval_plus'] = 1.00 | |
| 219 res.loc[res.pval_minus > 1, 'pval_minus'] = 1.00 | |
| 220 res = res.fillna(1.00) | |
| 221 | |
| 222 res['pval_plus_adj'] = multipletests(res["pval_plus"].values, alpha=0.01, method="bonferroni")[1] | |
| 223 res['pval_minus_adj'] = multipletests(res["pval_minus"].values, alpha=0.01, method="bonferroni")[1] | |
| 224 | |
| 225 res = res.fillna(1.00) | |
| 226 | |
| 227 res_plus = pd.DataFrame( | |
| 228 {"Position": res['Position'], "SPC_std": res['SPC_norm_plus'] * 100, "SPC": res['SPC_norm_plus_close'] * 100, | |
| 229 "pval_gamma": res['pval_plus'], "pval_gamma_adj": res['pval_plus_adj']}) | |
| 230 res_minus = pd.DataFrame( | |
| 231 {"Position": res['Position'], "SPC_std": res['SPC_norm_minus'] * 100, "SPC": res['SPC_norm_minus_close'] * 100, | |
| 232 "pval_gamma": res['pval_minus'], "pval_gamma_adj": res['pval_minus_adj']}) | |
| 233 | |
| 234 res_plus.sort_values("SPC", ascending=False, inplace=True) | |
| 235 res_minus.sort_values("SPC", ascending=False, inplace=True) | |
| 236 | |
| 237 res_plus.reset_index(drop=True, inplace=True) | |
| 238 res_minus.reset_index(drop=True, inplace=True) | |
| 239 | |
| 240 return res, res_plus, res_minus | |
| 241 | |
| 242 ### SCORING functions | |
| 243 # Li's methodology | |
| 244 def ratioR1(TopFreqH, used_reads, gen_len): | |
| 245 """Calculate the ratio H/A (R1) = highest frequency/average frequency. For Li's methodology.""" | |
| 246 AveFreq = (float(used_reads)/float(gen_len)/2) | |
| 247 if AveFreq == 0: | |
| 248 return 0, 0 | |
| 249 R1 = float(TopFreqH)/float(AveFreq) | |
| 250 return R1, AveFreq | |
| 251 | |
| 252 def ratioR(picMax): | |
| 253 """Calculate the T1/T2 = Top 1st frequency/Second higher frequency. For Li's methodology.""" | |
| 254 T1 = picMax[0][0] | |
| 255 T2 = max(1,picMax[1][0]) | |
| 256 R = float(T1)/float(T2) | |
| 257 return round(R) | |
| 258 | |
| 259 | |
| 260 def packMode(R1, R2, R3): | |
| 261 """Make the prognosis about the phage packaging mode and termini type. For Li's methodology.""" | |
| 262 packmode = "OTHER" | |
| 263 termini = "" | |
| 264 forward = "" | |
| 265 reverse = "" | |
| 266 | |
| 267 if R1 < 30: | |
| 268 termini = "Absence" | |
| 269 if R2 < 3: | |
| 270 forward = "No Obvious Termini" | |
| 271 if R3 < 3: | |
| 272 reverse = "No Obvious Termini" | |
| 273 elif R1 > 100: | |
| 274 termini = "Fixed" | |
| 275 if R2 < 3: | |
| 276 forward = "Multiple-Pref. Term." | |
| 277 if R3 < 3: | |
| 278 reverse = "Multiple-Pref. Term." | |
| 279 else: | |
| 280 termini = "Preferred" | |
| 281 if R2 < 3: | |
| 282 forward = "Multiple-Pref. Term." | |
| 283 if R3 < 3: | |
| 284 reverse = "Multiple-Pref. Term." | |
| 285 | |
| 286 if R2 >= 3: | |
| 287 forward = "Obvious Termini" | |
| 288 if R3 >= 3: | |
| 289 reverse = "Obvious Termini" | |
| 290 | |
| 291 if R2 >= 3 and R3 >= 3: | |
| 292 packmode = "COS" | |
| 293 if R2 >= 3 and R3 < 3: | |
| 294 packmode = "PAC" | |
| 295 if R2 < 3 and R3 >= 3: | |
| 296 packmode = "PAC" | |
| 297 return packmode, termini, forward, reverse | |
| 298 | |
| 299 ### PHAGE Information | |
| 300 def orientation(picMaxPlus, picMaxMinus): | |
| 301 """Return phage termini orientation.""" | |
| 302 if not picMaxPlus and not picMaxMinus: | |
| 303 return "NA" | |
| 304 if picMaxPlus and not picMaxMinus: | |
| 305 return "Forward" | |
| 306 if not picMaxPlus and picMaxMinus: | |
| 307 return "Reverse" | |
| 308 if picMaxPlus and picMaxMinus: | |
| 309 if picMaxPlus[0][0] > picMaxMinus[0][0]: | |
| 310 return "Forward" | |
| 311 elif picMaxMinus[0][0] > picMaxPlus[0][0]: | |
| 312 return "Reverse" | |
| 313 elif picMaxMinus[0][0] == picMaxPlus[0][0]: | |
| 314 return "NA" | |
| 315 | |
| 316 | |
| 317 def typeCOS(PosPlus, PosMinus, nbr_lim): | |
| 318 """ Return type of COS sequence.""" | |
| 319 if PosPlus < PosMinus and abs(PosPlus-PosMinus) < nbr_lim: | |
| 320 return "COS (5')", "Lambda" | |
| 321 else: | |
| 322 return "COS (3')", "HK97" | |
| 323 | |
| 324 def sequenceCohesive(Packmode, refseq, picMaxPlus, picMaxMinus, nbr_lim): | |
| 325 """Return cohesive sequence for COS phages.""" | |
| 326 if Packmode != 'COS': | |
| 327 return '', Packmode | |
| 328 PosPlus = picMaxPlus[0][1] | |
| 329 PosMinus = picMaxMinus[0][1] | |
| 330 | |
| 331 SC_class, SC_type = typeCOS(PosPlus, PosMinus, nbr_lim) | |
| 332 | |
| 333 if SC_class == "COS (5')": | |
| 334 if abs(PosMinus - PosPlus) < nbr_lim: | |
| 335 seqcoh = refseq[min(PosPlus, PosMinus):max(PosPlus, PosMinus) + 1] | |
| 336 return seqcoh, Packmode | |
| 337 else: | |
| 338 seqcoh = refseq[max(PosPlus, PosMinus) + 1:] + refseq[:min(PosPlus, PosMinus)] | |
| 339 return seqcoh, Packmode | |
| 340 | |
| 341 elif SC_class == "COS (3')": | |
| 342 if abs(PosMinus - PosPlus) < nbr_lim: | |
| 343 seqcoh = refseq[min(PosPlus, PosMinus) + 1:max(PosPlus, PosMinus)] | |
| 344 return seqcoh, Packmode | |
| 345 else: | |
| 346 seqcoh = refseq[max(PosPlus, PosMinus) + 1:] + refseq[:min(PosPlus, PosMinus)] | |
| 347 return seqcoh, Packmode | |
| 348 else: | |
| 349 return '', Packmode | |
| 350 | |
| 351 def selectSignificant(table, pvalue, limit): | |
| 352 """Return significant peaks over a limit""" | |
| 353 table_pvalue = table.loc[lambda df: df.pval_gamma_adj < pvalue, :] | |
| 354 table_pvalue_limit = table_pvalue.loc[lambda df: df.SPC > limit, :] | |
| 355 table_pvalue_limit.reset_index(drop=True, inplace=True) | |
| 356 return table_pvalue_limit | |
| 357 | |
| 358 def testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, phage_hybrid_coverage, Mu_threshold, hostseq): | |
| 359 """Return Mu if enough hybrid reads compared to theory.""" | |
| 360 if hostseq == "": | |
| 361 return 0, -1, -1, "" | |
| 362 if paired != "" and len(insert) != 0: | |
| 363 insert_mean = sum(insert) / len(insert) | |
| 364 else: | |
| 365 insert_mean = max(100, seed+10) | |
| 366 Mu_limit = ((insert_mean - seed) / float(gen_len)) * used_reads/2 | |
| 367 test = 0 | |
| 368 Mu_term_plus = "Random" | |
| 369 Mu_term_minus = "Random" | |
| 370 picMaxPlus_Mu, picMaxMinus_Mu, TopFreqH_phage_hybrid = picMax(phage_hybrid_coverage, 1) | |
| 371 picMaxPlus_Mu = picMaxPlus_Mu[0][1] | |
| 372 picMaxMinus_Mu = picMaxMinus_Mu[0][1] | |
| 373 | |
| 374 # Orientation | |
| 375 if list_hybrid[0] > list_hybrid[1]: | |
| 376 P_orient = "Forward" | |
| 377 elif list_hybrid[1] > list_hybrid[0]: | |
| 378 P_orient = "Reverse" | |
| 379 else: | |
| 380 P_orient = "" | |
| 381 | |
| 382 # Termini | |
| 383 if list_hybrid[0] > ( Mu_limit * Mu_threshold ): | |
| 384 test = 1 | |
| 385 pos_to_check = range(picMaxPlus_Mu+1,gen_len) + range(0,100) | |
| 386 for pos in pos_to_check: | |
| 387 if phage_hybrid_coverage[0][pos] >= max(1,phage_hybrid_coverage[0][picMaxPlus_Mu]/4): | |
| 388 Mu_term_plus = pos | |
| 389 picMaxPlus_Mu = pos | |
| 390 else: | |
| 391 Mu_term_plus = pos | |
| 392 break | |
| 393 # Reverse | |
| 394 if list_hybrid[1] > ( Mu_limit * Mu_threshold ): | |
| 395 test = 1 | |
| 396 pos_to_check = range(0,picMaxMinus_Mu)[::-1] + range(gen_len-100,gen_len)[::-1] | |
| 397 for pos in pos_to_check: | |
| 398 if phage_hybrid_coverage[1][pos] >= max(1,phage_hybrid_coverage[1][picMaxMinus_Mu]/4): | |
| 399 Mu_term_minus = pos | |
| 400 picMaxMinus_Mu = pos | |
| 401 else: | |
| 402 Mu_term_minus = pos | |
| 403 break | |
| 404 return test, Mu_term_plus, Mu_term_minus, P_orient | |
| 405 | |
| 406 ### DECISION Process | |
| 407 def decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, | |
| 408 used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq): | |
| 409 """ .""" | |
| 410 P_orient = "NA" | |
| 411 P_seqcoh = "" | |
| 412 P_concat = "" | |
| 413 P_type = "-" | |
| 414 Mu_like = 0 | |
| 415 P_left = "Random" | |
| 416 P_right = "Random" | |
| 417 # 2 peaks sig. | |
| 418 if not plus_significant.empty and not minus_significant.empty: | |
| 419 # Multiple | |
| 420 if (len(plus_significant["SPC"]) > 1 or len(minus_significant["SPC"]) > 1): | |
| 421 if not (plus_significant["SPC"][0] > limit_fixed or minus_significant["SPC"][0] > limit_fixed): | |
| 422 Redundant = 1 | |
| 423 P_left = "Multiple" | |
| 424 P_right = "Multiple" | |
| 425 Permuted = "Yes" | |
| 426 P_class = "-" | |
| 427 P_type = "-" | |
| 428 return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like | |
| 429 | |
| 430 dist_peak = abs(plus_significant['Position'][0] - minus_significant['Position'][0]) | |
| 431 dist_peak_over = abs(abs(plus_significant['Position'][0] - minus_significant['Position'][0]) - gen_len) | |
| 432 P_left = plus_significant['Position'][0] | |
| 433 P_right = minus_significant['Position'][0] | |
| 434 # COS | |
| 435 if (dist_peak <= 2) or (dist_peak_over <= 2): | |
| 436 Redundant = 0 | |
| 437 Permuted = "No" | |
| 438 P_class = "COS" | |
| 439 P_type = "-" | |
| 440 elif (dist_peak < 20 and dist_peak > 2) or (dist_peak_over < 20 and dist_peak_over > 2): | |
| 441 Redundant = 0 | |
| 442 Permuted = "No" | |
| 443 P_class, P_type = typeCOS(plus_significant["Position"][0], minus_significant["Position"][0], gen_len / 2) | |
| 444 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [ | |
| 445 ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]), | |
| 446 ( | |
| 447 minus_significant["Position"][ | |
| 448 0]) - 1)], gen_len / 2) | |
| 449 # DTR | |
| 450 elif (dist_peak <= 1000) or (dist_peak_over <= 1000): | |
| 451 Redundant = 1 | |
| 452 Permuted = "No" | |
| 453 P_class = "DTR (short)" | |
| 454 P_type = "T7" | |
| 455 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [ | |
| 456 ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]), | |
| 457 ( | |
| 458 minus_significant["Position"][ | |
| 459 0]) - 1)], gen_len / 2) | |
| 460 elif (dist_peak <= 0.1 * gen_len) or (dist_peak_over <= 0.1 * gen_len): | |
| 461 Redundant = 1 | |
| 462 Permuted = "No" | |
| 463 P_class = "DTR (long)" | |
| 464 P_type = "T5" | |
| 465 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [ | |
| 466 ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]), | |
| 467 ( | |
| 468 minus_significant["Position"][ | |
| 469 0]) - 1)], gen_len / 2) | |
| 470 else: | |
| 471 Redundant = 1 | |
| 472 Permuted = "No" | |
| 473 P_class = "-" | |
| 474 P_type = "-" | |
| 475 # 1 peak sig. | |
| 476 elif not plus_significant.empty and minus_significant.empty or plus_significant.empty and not minus_significant.empty: | |
| 477 Redundant = 1 | |
| 478 Permuted = "Yes" | |
| 479 P_class = "Headful (pac)" | |
| 480 P_type = "P1" | |
| 481 if paired != "": | |
| 482 if R1 == 0 or len(insert) == 0: | |
| 483 P_concat = 1 | |
| 484 else: | |
| 485 P_concat = round((sum(insert) / len(insert)) / R1) - 1 | |
| 486 if not plus_significant.empty: | |
| 487 P_left = plus_significant['Position'][0] | |
| 488 P_right = "Distributed" | |
| 489 P_orient = "Forward" | |
| 490 else: | |
| 491 P_left = "Distributed" | |
| 492 P_right = minus_significant['Position'][0] | |
| 493 P_orient = "Reverse" | |
| 494 # 0 peak sig. | |
| 495 elif plus_significant.empty and minus_significant.empty: | |
| 496 Mu_like, Mu_term_plus, Mu_term_minus, P_orient = testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, | |
| 497 phage_hybrid_coverage, Mu_threshold, hostseq) | |
| 498 if Mu_like: | |
| 499 Redundant = 0 | |
| 500 Permuted = "No" | |
| 501 P_class = "Mu-like" | |
| 502 P_type = "Mu" | |
| 503 P_left = Mu_term_plus | |
| 504 P_right = Mu_term_minus | |
| 505 else: | |
| 506 Redundant = 1 | |
| 507 Permuted = "Yes" | |
| 508 P_class = "-" | |
| 509 P_type = "-" | |
| 510 | |
| 511 return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like | |
| 512 | |
| 513 # Processes coverage values for a sequence. | |
| 514 def processCovValuesForSeq(refseq,hostseq,refseq_name,refseq_liste,seed,analysis_name,tot_reads,results_pos,test_run, paired,edge,host,test, surrounding,limit_preferred,limit_fixed,Mu_threshold,\ | |
| 515 termini_coverage,whole_coverage,paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, host_whole_coverage,insert,list_hybrid,reads_tested,DR,DR_path=None): | |
| 516 | |
| 517 print("\n\nFinished calculating coverage values, the remainder should be completed rapidly\n", | |
| 518 strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())) | |
| 519 | |
| 520 # WHOLE Coverage : Average, Maximum and Minimum | |
| 521 added_whole_coverage, ave_whole_cov = wholeCov(whole_coverage, len(refseq)) | |
| 522 added_paired_whole_coverage, ave_paired_whole_cov = wholeCov(paired_whole_coverage, len(refseq)) | |
| 523 added_host_whole_coverage, ave_host_whole_cov = wholeCov(host_whole_coverage, len(hostseq)) | |
| 524 | |
| 525 drop_cov = testwholeCov(added_whole_coverage, ave_whole_cov, test_run) | |
| 526 | |
| 527 # NORM pic by whole coverage (1 base) | |
| 528 if paired != "": | |
| 529 #paired_whole_coverage_test = maxPaired(paired_whole_coverage, whole_coverage) | |
| 530 termini_coverage_norm, mean_nc = normCov(termini_coverage, paired_whole_coverage, max(10, ave_whole_cov / 1.5), | |
| 531 edge) | |
| 532 else: | |
| 533 termini_coverage_norm, mean_nc = normCov(termini_coverage, whole_coverage, max(10, ave_whole_cov / 1.5), edge) | |
| 534 | |
| 535 # REMOVE edge | |
| 536 termini_coverage[0] = RemoveEdge(termini_coverage[0], edge) | |
| 537 termini_coverage[1] = RemoveEdge(termini_coverage[1], edge) | |
| 538 termini_coverage_norm[0] = RemoveEdge(termini_coverage_norm[0], edge) | |
| 539 termini_coverage_norm[1] = RemoveEdge(termini_coverage_norm[1], edge) | |
| 540 whole_coverage[0] = RemoveEdge(whole_coverage[0], edge) | |
| 541 whole_coverage[1] = RemoveEdge(whole_coverage[1], edge) | |
| 542 paired_whole_coverage[0] = RemoveEdge(paired_whole_coverage[0], edge) | |
| 543 paired_whole_coverage[1] = RemoveEdge(paired_whole_coverage[1], edge) | |
| 544 added_whole_coverage = RemoveEdge(added_whole_coverage, edge) | |
| 545 added_paired_whole_coverage = RemoveEdge(added_paired_whole_coverage, edge) | |
| 546 added_host_whole_coverage = RemoveEdge(added_host_whole_coverage, edge) | |
| 547 phage_hybrid_coverage[0] = RemoveEdge(phage_hybrid_coverage[0], edge) | |
| 548 phage_hybrid_coverage[1] = RemoveEdge(phage_hybrid_coverage[1], edge) | |
| 549 host_whole_coverage[0] = RemoveEdge(host_whole_coverage[0], edge) | |
| 550 host_whole_coverage[1] = RemoveEdge(host_whole_coverage[1], edge) | |
| 551 host_hybrid_coverage[0] = RemoveEdge(host_hybrid_coverage[0], edge) | |
| 552 host_hybrid_coverage[1] = RemoveEdge(host_hybrid_coverage[1], edge) | |
| 553 refseq = RemoveEdge(refseq, edge) | |
| 554 if host != "": | |
| 555 hostseq = RemoveEdge(hostseq, edge) | |
| 556 gen_len = len(refseq) | |
| 557 host_len = len(hostseq) | |
| 558 if test == "DL": | |
| 559 gen_len = 100000 | |
| 560 | |
| 561 # READS Total, Used and Lost | |
| 562 used_reads, lost_reads, lost_perc = usedReads(termini_coverage, reads_tested) | |
| 563 | |
| 564 # PIC Max | |
| 565 picMaxPlus, picMaxMinus, TopFreqH = picMax(termini_coverage, 5) | |
| 566 picMaxPlus_norm, picMaxMinus_norm, TopFreqH_norm = picMax(termini_coverage_norm, 5) | |
| 567 picMaxPlus_host, picMaxMinus_host, TopFreqH_host = picMax(host_whole_coverage, 5) | |
| 568 | |
| 569 ### ANALYSIS | |
| 570 | |
| 571 ## Close Peaks | |
| 572 picMaxPlus, picOUT_forw = RemoveClosePicMax(picMaxPlus, gen_len, surrounding) | |
| 573 picMaxMinus, picOUT_rev = RemoveClosePicMax(picMaxMinus, gen_len, surrounding) | |
| 574 picMaxPlus_norm, picOUT_norm_forw = RemoveClosePicMax(picMaxPlus_norm, gen_len, surrounding) | |
| 575 picMaxMinus_norm, picOUT_norm_rev = RemoveClosePicMax(picMaxMinus_norm, gen_len, surrounding) | |
| 576 | |
| 577 termini_coverage_close = termini_coverage[:] | |
| 578 termini_coverage_close[0], picOUT_forw = addClosePic(termini_coverage[0], picOUT_forw) | |
| 579 termini_coverage_close[1], picOUT_rev = addClosePic(termini_coverage[1], picOUT_rev) | |
| 580 | |
| 581 termini_coverage_norm_close = termini_coverage_norm[:] | |
| 582 termini_coverage_norm_close[0], picOUT_norm_forw = addClosePic(termini_coverage_norm[0], picOUT_norm_forw, 1) | |
| 583 termini_coverage_norm_close[1], picOUT_norm_rev = addClosePic(termini_coverage_norm[1], picOUT_norm_rev, 1) | |
| 584 | |
| 585 ## Statistical Analysis | |
| 586 picMaxPlus_norm_close, picMaxMinus_norm_close, TopFreqH_norm = picMax(termini_coverage_norm_close, 5) | |
| 587 phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, | |
| 588 termini_coverage_norm, | |
| 589 termini_coverage_norm_close) | |
| 590 # VL: comment that since the 2 different conditions lead to the execution of the same piece of code... | |
| 591 # if paired != "": | |
| 592 # phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, | |
| 593 # termini_coverage_norm, | |
| 594 # termini_coverage_norm_close) | |
| 595 # else: | |
| 596 # phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(whole_coverage, termini_coverage, | |
| 597 # termini_coverage_norm, | |
| 598 # termini_coverage_norm_close) | |
| 599 | |
| 600 | |
| 601 ## LI Analysis | |
| 602 picMaxPlus_close, picMaxMinus_close, TopFreqH = picMax(termini_coverage_close, 5) | |
| 603 | |
| 604 R1, AveFreq = ratioR1(TopFreqH, used_reads, gen_len) | |
| 605 R2 = ratioR(picMaxPlus_close) | |
| 606 R3 = ratioR(picMaxMinus_close) | |
| 607 | |
| 608 ArtPackmode, termini, forward, reverse = packMode(R1, R2, R3) | |
| 609 ArtOrient = orientation(picMaxPlus_close, picMaxMinus_close) | |
| 610 ArtcohesiveSeq, ArtPackmode = sequenceCohesive(ArtPackmode, refseq, picMaxPlus_close, picMaxMinus_close, | |
| 611 gen_len / 2) | |
| 612 | |
| 613 ### DECISION Process | |
| 614 | |
| 615 # PEAKS Significativity | |
| 616 plus_significant = selectSignificant(phage_plus_norm, 1.0 / gen_len, limit_preferred) | |
| 617 minus_significant = selectSignificant(phage_minus_norm, 1.0 / gen_len, limit_preferred) | |
| 618 | |
| 619 # DECISION | |
| 620 Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like = decisionProcess( | |
| 621 plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads, | |
| 622 seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq) | |
| 623 | |
| 624 ### Results | |
| 625 if len(refseq_liste) != 1: | |
| 626 if P_class == "-": | |
| 627 if P_left == "Random" and P_right == "Random": | |
| 628 P_class = "UNKNOWN" | |
| 629 else: | |
| 630 P_class = "NEW" | |
| 631 DR[P_class][checkReportTitle(refseq_name[results_pos])] = {"analysis_name": analysis_name, "seed": seed, | |
| 632 "added_whole_coverage": added_whole_coverage, | |
| 633 "Redundant": Redundant, "P_left": P_left, | |
| 634 "P_right": P_right, "Permuted": Permuted, | |
| 635 "P_orient": P_orient, | |
| 636 "termini_coverage_norm_close": termini_coverage_norm_close, | |
| 637 "picMaxPlus_norm_close": picMaxPlus_norm_close, | |
| 638 "picMaxMinus_norm_close": picMaxMinus_norm_close, | |
| 639 "gen_len": gen_len, "tot_reads": tot_reads, | |
| 640 "P_seqcoh": P_seqcoh, | |
| 641 "phage_plus_norm": phage_plus_norm, | |
| 642 "phage_minus_norm": phage_minus_norm, | |
| 643 "ArtPackmode": ArtPackmode, "termini": termini, | |
| 644 "forward": forward, "reverse": reverse, | |
| 645 "ArtOrient": ArtOrient, | |
| 646 "ArtcohesiveSeq": ArtcohesiveSeq, | |
| 647 "termini_coverage_close": termini_coverage_close, | |
| 648 "picMaxPlus_close": picMaxPlus_close, | |
| 649 "picMaxMinus_close": picMaxMinus_close, | |
| 650 "picOUT_norm_forw": picOUT_norm_forw, | |
| 651 "picOUT_norm_rev": picOUT_norm_rev, | |
| 652 "picOUT_forw": picOUT_forw, | |
| 653 "picOUT_rev": picOUT_rev, "lost_perc": lost_perc, | |
| 654 "ave_whole_cov": ave_whole_cov, "R1": R1, "R2": R2, | |
| 655 "R3": R3, "host": host, "host_len": host_len, | |
| 656 "host_whole_coverage": host_whole_coverage, | |
| 657 "picMaxPlus_host": picMaxPlus_host, | |
| 658 "picMaxMinus_host": picMaxMinus_host, | |
| 659 "surrounding": surrounding, "drop_cov": drop_cov, | |
| 660 "paired": paired, "insert": insert, | |
| 661 "phage_hybrid_coverage": phage_hybrid_coverage, | |
| 662 "host_hybrid_coverage": host_hybrid_coverage, | |
| 663 "added_paired_whole_coverage": added_paired_whole_coverage, | |
| 664 "Mu_like": Mu_like, "test_run": test_run, | |
| 665 "P_class": P_class, "P_type": P_type, | |
| 666 "P_concat": P_concat, | |
| 667 "idx_refseq_in_list": results_pos} | |
| 668 | |
| 669 if DR_path!=None: # multi machine cluster mode. | |
| 670 strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) | |
| 671 P_class_dir=os.path.join(DR_path,P_class) | |
| 672 if os.path.exists(P_class_dir): | |
| 673 if not os.path.isdir(P_class_dir): | |
| 674 raise RuntimeError("P_class_dir is not a directory") | |
| 675 else: | |
| 676 os.mkdir(P_class_dir) | |
| 677 import pickle | |
| 678 fic_name=os.path.join(P_class_dir,checkReportTitle(refseq_name[results_pos])) | |
| 679 items_to_save=(analysis_name,seed,added_whole_coverage,Redundant,P_left,P_right,Permuted, \ | |
| 680 P_orient,termini_coverage_norm_close,picMaxPlus_norm_close,picMaxMinus_norm_close, \ | |
| 681 gen_len,tot_reads,P_seqcoh,phage_plus_norm,phage_minus_norm,ArtPackmode,termini, \ | |
| 682 forward,reverse,ArtOrient,ArtcohesiveSeq,termini_coverage_close,picMaxPlus_close, \ | |
| 683 picMaxMinus_close,picOUT_norm_forw,picOUT_norm_rev,picOUT_forw,picOUT_rev, \ | |
| 684 lost_perc,ave_whole_cov,R1,R2,R3,host,host_len,host_whole_coverage,picMaxPlus_host, \ | |
| 685 picMaxMinus_host,surrounding,drop_cov,paired, insert,phage_hybrid_coverage, \ | |
| 686 host_hybrid_coverage,added_paired_whole_coverage,Mu_like,test_run,P_class,P_type,\ | |
| 687 P_concat,results_pos) | |
| 688 with open(fic_name,'wb') as f: | |
| 689 pickle.dump(items_to_save,f) | |
| 690 f.close() | |
| 691 | |
| 692 return SeqStats(P_class, P_left, P_right, P_type, P_orient, ave_whole_cov, phage_plus_norm, phage_minus_norm, ArtcohesiveSeq, P_seqcoh, Redundant, Mu_like, \ | |
| 693 added_whole_coverage, Permuted, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, termini_coverage_close, \ | |
| 694 ArtPackmode, termini, forward, reverse, ArtOrient, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, \ | |
| 695 lost_perc, R1, R2, R3, picMaxPlus_host, picMaxMinus_host, drop_cov, added_paired_whole_coverage, P_concat) |
