Mercurial > repos > bioit_sciensano > phagetermvirome
diff _modules/common_readsCoverage_processing.py @ 0:69e8f12c8b31 draft
"planemo upload"
author | bioit_sciensano |
---|---|
date | Fri, 11 Mar 2022 15:06:20 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/_modules/common_readsCoverage_processing.py Fri Mar 11 15:06:20 2022 +0000 @@ -0,0 +1,695 @@ +## @file common_readsCoverage_processing.py +# +# VL: here I gathered functions that are common to both GPU and mono/multi CPU versions. +# These functions are called after the mapping is done and all the counters are filled from mapping output results. +from __future__ import print_function + +from time import gmtime, strftime +import heapq +import itertools + +import numpy as np +import pandas as pd +# Statistics +from scipy import stats +from statsmodels.sandbox.stats.multicomp import multipletests +from sklearn.tree import DecisionTreeRegressor #TODO VL: fix issue on importing that + +from _modules.utilities import checkReportTitle +from _modules.SeqStats import SeqStats + +import os + + +k_no_match_for_contig=1 + +def wholeCov(whole_coverage, gen_len): + """Calculate the coverage for whole read alignments and its average""" + if gen_len == 0: + return whole_coverage, 1 + total_cov = sum(whole_coverage[0]) + sum(whole_coverage[1]) + ave_whole_cov = float(total_cov) / (2 * float(gen_len)) + added_whole_coverage = [x + y for x, y in zip(whole_coverage[0], whole_coverage[1])] + return added_whole_coverage, ave_whole_cov + +def testwholeCov(added_whole_coverage, ave_whole_cov, test): + """Return information about whole coverage.""" + if test: + return "" + if ave_whole_cov < 50: + print("\nWARNING: average coverage is under the limit of the software (50)") + elif ave_whole_cov < 200: + print("\nWARNING: average coverage is low (<200), Li's method is presumably unreliable\n") + drop_cov = [] + start_pos = last_pos = count_pos = 0 + for pos in range(len(added_whole_coverage)): + if added_whole_coverage[pos] < (ave_whole_cov / 1.5): + if pos == last_pos+1: + count_pos += 1 + last_pos = pos + else: + if count_pos > 100: + drop_cov.append( (start_pos,last_pos+1) ) + last_pos = start_pos = pos + count_pos = 0 + last_pos = pos + return drop_cov + +def maxPaired(paired_whole_coverage, whole_coverage): + """Max paired coverage using whole coverage, counter edge effect with paired-ends.""" + pwc = paired_whole_coverage[:] + wc = whole_coverage[:] + for i in range(len(pwc)): + for j in range(len(pwc[i])): + if pwc[i][j] < wc[i][j]: + pwc[i][j] = wc[i][j] + return pwc + +def replaceNormMean(norm_cov): + """Replace the values not normalised due to covLimit by mean.""" + nc_sum = nc_count = 0 + for nc in norm_cov: + if nc > 0: + nc_sum += nc + nc_count += 1 + if nc_count == 0: + mean_nc = 0 + else: + mean_nc = nc_sum / float(nc_count) + for i in range(len(norm_cov)): + if norm_cov[i] == 0: + norm_cov[i] = mean_nc + return norm_cov, mean_nc + +def normCov(termini_coverage, whole_coverage, covLimit, edge): + """Return the termini_coverage normalised by the whole coverage (% of coverage due to first base).""" + normalised_coverage = [len(termini_coverage[0])*[0], len(termini_coverage[0])*[0]] + termini_len = len(termini_coverage[0]) + mean_nc = [1,1] + for i in range(len(termini_coverage)): + for j in range(len(termini_coverage[i])): + if j < edge or j > termini_len-edge: + continue + if whole_coverage[i][j] >= covLimit: + if float(whole_coverage[i][j]) != 0: + normalised_coverage[i][j] = float(termini_coverage[i][j]) / float(whole_coverage[i][j]) + else: + normalised_coverage[i][j] = 0 + else: + normalised_coverage[i][j] = 0 + normalised_coverage[i], mean_nc[i] = replaceNormMean(normalised_coverage[i]) + return normalised_coverage, mean_nc + +def RemoveEdge(tableau, edge): + return tableau[edge:-edge] + +def usedReads(coverage, tot_reads): + """Retrieve the number of reads after alignment and calculate the percentage of reads lost.""" + used_reads = sum(coverage[0]) + sum(coverage[1]) + lost_reads = tot_reads - used_reads + lost_perc = (float(tot_reads) - float(used_reads))/float(tot_reads) * 100 + return used_reads, lost_reads, lost_perc + +### PEAK functions +def picMax(coverage, nbr_pic): + """COORDINATES (coverage value, position) of the nbr_pic largest coverage value.""" + if coverage == [[],[]] or coverage == []: + return "", "", "" + picMaxPlus = heapq.nlargest(nbr_pic, zip(coverage[0], itertools.count())) + picMaxMinus = heapq.nlargest(nbr_pic, zip(coverage[1], itertools.count())) + TopFreqH = max(max(np.array(list(zip(*picMaxPlus))[0])), max(np.array(list(zip(*picMaxMinus))[0]))) + return picMaxPlus, picMaxMinus, TopFreqH + +def RemoveClosePicMax(picMax, gen_len, nbr_base): + """Remove peaks that are too close of the maximum (nbr_base around)""" + if nbr_base == 0: + return picMax[1:], [picMax[0]] + picMaxRC = picMax[:] + posMax = picMaxRC[0][1] + LimSup = posMax + nbr_base + LimInf = posMax - nbr_base + if LimSup < gen_len and LimInf >= 0: + PosOut = list(range(LimInf,LimSup)) + elif LimSup >= gen_len: + TurnSup = LimSup - gen_len + PosOut = list(range(posMax,gen_len))+list(range(0,TurnSup)) + list(range(LimInf,posMax)) + elif LimInf < 0: + TurnInf = gen_len + LimInf + PosOut = list(range(0,posMax))+list(range(TurnInf,gen_len)) + list(range(posMax,LimSup)) + picMaxOK = [] + picOUT = [] + for peaks in picMaxRC: + if peaks[1] not in PosOut: + picMaxOK.append(peaks) + else: + picOUT.append(peaks) + return picMaxOK, picOUT + +def addClosePic(picList, picClose, norm = 0): + """Add coverage value of close peaks to the top peak. Remove picClose in picList if exist.""" + if norm: + if picClose[0][0] >= 0.5: + return picList, [picClose[0]] + picListOK = picList[:] + cov_add = 0 + for cov in picClose: + cov_add += cov[0] + picListOK[cov[1]] = 0.01 + picListOK[picClose[0][1]] = cov_add + return picListOK, picClose + +def remove_pics(arr,n): + '''Removes the n highest values from the array''' + arr=np.array(arr) + pic_pos=arr.argsort()[-n:][::-1] + arr2=np.delete(arr,pic_pos) + return arr2 + +def gamma(X): + """Apply a gamma distribution.""" + X = np.array(X, dtype=np.int64) + v = remove_pics(X, 3) + + dist_max = float(max(v)) + if dist_max == 0: + return np.array([1.00] * len(X)) + + actual = np.bincount(v) + fit_alpha, fit_loc, fit_beta = stats.gamma.fit(v) + expected = stats.gamma.pdf(np.arange(0, dist_max + 1, 1), fit_alpha, loc=fit_loc, scale=fit_beta) * sum(actual) + + return stats.gamma.pdf(X, fit_alpha, loc=fit_loc, scale=fit_beta) + + +# STATISTICS +def test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close): + """Fits a gamma distribution using a decision tree.""" + L = len(whole_coverage[0]) + res = pd.DataFrame({"Position": np.array(range(L)) + 1, "termini_plus": termini_coverage[0], + "SPC_norm_plus": termini_coverage_norm[0], "SPC_norm_minus": termini_coverage_norm[1], + "SPC_norm_plus_close": termini_coverage_norm_close[0], + "SPC_norm_minus_close": termini_coverage_norm_close[1], "termini_minus": termini_coverage[1], + "cov_plus": whole_coverage[0], "cov_minus": whole_coverage[1]}) + + res["cov"] = res["cov_plus"].values + res["cov_minus"].values + + res["R_plus"] = list(map(float, termini_coverage[0])) // np.mean(termini_coverage[0]) + res["R_minus"] = list(map(float, termini_coverage[1])) // np.mean(termini_coverage[1]) + + regr = DecisionTreeRegressor(max_depth=3, min_samples_leaf=100) + X = np.arange(L) + X = X[:, np.newaxis] + y = res["cov"].values + regr.fit(X, y) + + # Predict + y_1 = regr.predict(X) + res["covnode"] = y_1 + covnodes = np.unique(y_1) + thres = np.mean(whole_coverage[0]) / 2 + covnodes = [n for n in covnodes if n > thres] + + for node in covnodes: + X = res[res["covnode"] == node]["termini_plus"].values + res.loc[res["covnode"] == node, "pval_plus"] = gamma(X) + X = res[res["covnode"] == node]["termini_minus"].values + res.loc[res["covnode"] == node, "pval_minus"] = gamma(X) + + res.loc[res.pval_plus > 1, 'pval_plus'] = 1.00 + res.loc[res.pval_minus > 1, 'pval_minus'] = 1.00 + res = res.fillna(1.00) + + res['pval_plus_adj'] = multipletests(res["pval_plus"].values, alpha=0.01, method="bonferroni")[1] + res['pval_minus_adj'] = multipletests(res["pval_minus"].values, alpha=0.01, method="bonferroni")[1] + + res = res.fillna(1.00) + + res_plus = pd.DataFrame( + {"Position": res['Position'], "SPC_std": res['SPC_norm_plus'] * 100, "SPC": res['SPC_norm_plus_close'] * 100, + "pval_gamma": res['pval_plus'], "pval_gamma_adj": res['pval_plus_adj']}) + res_minus = pd.DataFrame( + {"Position": res['Position'], "SPC_std": res['SPC_norm_minus'] * 100, "SPC": res['SPC_norm_minus_close'] * 100, + "pval_gamma": res['pval_minus'], "pval_gamma_adj": res['pval_minus_adj']}) + + res_plus.sort_values("SPC", ascending=False, inplace=True) + res_minus.sort_values("SPC", ascending=False, inplace=True) + + res_plus.reset_index(drop=True, inplace=True) + res_minus.reset_index(drop=True, inplace=True) + + return res, res_plus, res_minus + +### SCORING functions +# Li's methodology +def ratioR1(TopFreqH, used_reads, gen_len): + """Calculate the ratio H/A (R1) = highest frequency/average frequency. For Li's methodology.""" + AveFreq = (float(used_reads)/float(gen_len)/2) + if AveFreq == 0: + return 0, 0 + R1 = float(TopFreqH)/float(AveFreq) + return R1, AveFreq + +def ratioR(picMax): + """Calculate the T1/T2 = Top 1st frequency/Second higher frequency. For Li's methodology.""" + T1 = picMax[0][0] + T2 = max(1,picMax[1][0]) + R = float(T1)/float(T2) + return round(R) + + +def packMode(R1, R2, R3): + """Make the prognosis about the phage packaging mode and termini type. For Li's methodology.""" + packmode = "OTHER" + termini = "" + forward = "" + reverse = "" + + if R1 < 30: + termini = "Absence" + if R2 < 3: + forward = "No Obvious Termini" + if R3 < 3: + reverse = "No Obvious Termini" + elif R1 > 100: + termini = "Fixed" + if R2 < 3: + forward = "Multiple-Pref. Term." + if R3 < 3: + reverse = "Multiple-Pref. Term." + else: + termini = "Preferred" + if R2 < 3: + forward = "Multiple-Pref. Term." + if R3 < 3: + reverse = "Multiple-Pref. Term." + + if R2 >= 3: + forward = "Obvious Termini" + if R3 >= 3: + reverse = "Obvious Termini" + + if R2 >= 3 and R3 >= 3: + packmode = "COS" + if R2 >= 3 and R3 < 3: + packmode = "PAC" + if R2 < 3 and R3 >= 3: + packmode = "PAC" + return packmode, termini, forward, reverse + +### PHAGE Information +def orientation(picMaxPlus, picMaxMinus): + """Return phage termini orientation.""" + if not picMaxPlus and not picMaxMinus: + return "NA" + if picMaxPlus and not picMaxMinus: + return "Forward" + if not picMaxPlus and picMaxMinus: + return "Reverse" + if picMaxPlus and picMaxMinus: + if picMaxPlus[0][0] > picMaxMinus[0][0]: + return "Forward" + elif picMaxMinus[0][0] > picMaxPlus[0][0]: + return "Reverse" + elif picMaxMinus[0][0] == picMaxPlus[0][0]: + return "NA" + + +def typeCOS(PosPlus, PosMinus, nbr_lim): + """ Return type of COS sequence.""" + if PosPlus < PosMinus and abs(PosPlus-PosMinus) < nbr_lim: + return "COS (5')", "Lambda" + else: + return "COS (3')", "HK97" + +def sequenceCohesive(Packmode, refseq, picMaxPlus, picMaxMinus, nbr_lim): + """Return cohesive sequence for COS phages.""" + if Packmode != 'COS': + return '', Packmode + PosPlus = picMaxPlus[0][1] + PosMinus = picMaxMinus[0][1] + + SC_class, SC_type = typeCOS(PosPlus, PosMinus, nbr_lim) + + if SC_class == "COS (5')": + if abs(PosMinus - PosPlus) < nbr_lim: + seqcoh = refseq[min(PosPlus, PosMinus):max(PosPlus, PosMinus) + 1] + return seqcoh, Packmode + else: + seqcoh = refseq[max(PosPlus, PosMinus) + 1:] + refseq[:min(PosPlus, PosMinus)] + return seqcoh, Packmode + + elif SC_class == "COS (3')": + if abs(PosMinus - PosPlus) < nbr_lim: + seqcoh = refseq[min(PosPlus, PosMinus) + 1:max(PosPlus, PosMinus)] + return seqcoh, Packmode + else: + seqcoh = refseq[max(PosPlus, PosMinus) + 1:] + refseq[:min(PosPlus, PosMinus)] + return seqcoh, Packmode + else: + return '', Packmode + +def selectSignificant(table, pvalue, limit): + """Return significant peaks over a limit""" + table_pvalue = table.loc[lambda df: df.pval_gamma_adj < pvalue, :] + table_pvalue_limit = table_pvalue.loc[lambda df: df.SPC > limit, :] + table_pvalue_limit.reset_index(drop=True, inplace=True) + return table_pvalue_limit + +def testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, phage_hybrid_coverage, Mu_threshold, hostseq): + """Return Mu if enough hybrid reads compared to theory.""" + if hostseq == "": + return 0, -1, -1, "" + if paired != "" and len(insert) != 0: + insert_mean = sum(insert) / len(insert) + else: + insert_mean = max(100, seed+10) + Mu_limit = ((insert_mean - seed) / float(gen_len)) * used_reads/2 + test = 0 + Mu_term_plus = "Random" + Mu_term_minus = "Random" + picMaxPlus_Mu, picMaxMinus_Mu, TopFreqH_phage_hybrid = picMax(phage_hybrid_coverage, 1) + picMaxPlus_Mu = picMaxPlus_Mu[0][1] + picMaxMinus_Mu = picMaxMinus_Mu[0][1] + + # Orientation + if list_hybrid[0] > list_hybrid[1]: + P_orient = "Forward" + elif list_hybrid[1] > list_hybrid[0]: + P_orient = "Reverse" + else: + P_orient = "" + + # Termini + if list_hybrid[0] > ( Mu_limit * Mu_threshold ): + test = 1 + pos_to_check = range(picMaxPlus_Mu+1,gen_len) + range(0,100) + for pos in pos_to_check: + if phage_hybrid_coverage[0][pos] >= max(1,phage_hybrid_coverage[0][picMaxPlus_Mu]/4): + Mu_term_plus = pos + picMaxPlus_Mu = pos + else: + Mu_term_plus = pos + break + # Reverse + if list_hybrid[1] > ( Mu_limit * Mu_threshold ): + test = 1 + pos_to_check = range(0,picMaxMinus_Mu)[::-1] + range(gen_len-100,gen_len)[::-1] + for pos in pos_to_check: + if phage_hybrid_coverage[1][pos] >= max(1,phage_hybrid_coverage[1][picMaxMinus_Mu]/4): + Mu_term_minus = pos + picMaxMinus_Mu = pos + else: + Mu_term_minus = pos + break + return test, Mu_term_plus, Mu_term_minus, P_orient + +### DECISION Process +def decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, + used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq): + """ .""" + P_orient = "NA" + P_seqcoh = "" + P_concat = "" + P_type = "-" + Mu_like = 0 + P_left = "Random" + P_right = "Random" + # 2 peaks sig. + if not plus_significant.empty and not minus_significant.empty: + # Multiple + if (len(plus_significant["SPC"]) > 1 or len(minus_significant["SPC"]) > 1): + if not (plus_significant["SPC"][0] > limit_fixed or minus_significant["SPC"][0] > limit_fixed): + Redundant = 1 + P_left = "Multiple" + P_right = "Multiple" + Permuted = "Yes" + P_class = "-" + P_type = "-" + return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like + + dist_peak = abs(plus_significant['Position'][0] - minus_significant['Position'][0]) + dist_peak_over = abs(abs(plus_significant['Position'][0] - minus_significant['Position'][0]) - gen_len) + P_left = plus_significant['Position'][0] + P_right = minus_significant['Position'][0] + # COS + if (dist_peak <= 2) or (dist_peak_over <= 2): + Redundant = 0 + Permuted = "No" + P_class = "COS" + P_type = "-" + elif (dist_peak < 20 and dist_peak > 2) or (dist_peak_over < 20 and dist_peak_over > 2): + Redundant = 0 + Permuted = "No" + P_class, P_type = typeCOS(plus_significant["Position"][0], minus_significant["Position"][0], gen_len / 2) + P_seqcoh, packstat = sequenceCohesive("COS", refseq, [ + ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]), + ( + minus_significant["Position"][ + 0]) - 1)], gen_len / 2) + # DTR + elif (dist_peak <= 1000) or (dist_peak_over <= 1000): + Redundant = 1 + Permuted = "No" + P_class = "DTR (short)" + P_type = "T7" + P_seqcoh, packstat = sequenceCohesive("COS", refseq, [ + ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]), + ( + minus_significant["Position"][ + 0]) - 1)], gen_len / 2) + elif (dist_peak <= 0.1 * gen_len) or (dist_peak_over <= 0.1 * gen_len): + Redundant = 1 + Permuted = "No" + P_class = "DTR (long)" + P_type = "T5" + P_seqcoh, packstat = sequenceCohesive("COS", refseq, [ + ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]), + ( + minus_significant["Position"][ + 0]) - 1)], gen_len / 2) + else: + Redundant = 1 + Permuted = "No" + P_class = "-" + P_type = "-" + # 1 peak sig. + elif not plus_significant.empty and minus_significant.empty or plus_significant.empty and not minus_significant.empty: + Redundant = 1 + Permuted = "Yes" + P_class = "Headful (pac)" + P_type = "P1" + if paired != "": + if R1 == 0 or len(insert) == 0: + P_concat = 1 + else: + P_concat = round((sum(insert) / len(insert)) / R1) - 1 + if not plus_significant.empty: + P_left = plus_significant['Position'][0] + P_right = "Distributed" + P_orient = "Forward" + else: + P_left = "Distributed" + P_right = minus_significant['Position'][0] + P_orient = "Reverse" + # 0 peak sig. + elif plus_significant.empty and minus_significant.empty: + Mu_like, Mu_term_plus, Mu_term_minus, P_orient = testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, + phage_hybrid_coverage, Mu_threshold, hostseq) + if Mu_like: + Redundant = 0 + Permuted = "No" + P_class = "Mu-like" + P_type = "Mu" + P_left = Mu_term_plus + P_right = Mu_term_minus + else: + Redundant = 1 + Permuted = "Yes" + P_class = "-" + P_type = "-" + + return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like + +# Processes coverage values for a sequence. +def processCovValuesForSeq(refseq,hostseq,refseq_name,refseq_liste,seed,analysis_name,tot_reads,results_pos,test_run, paired,edge,host,test, surrounding,limit_preferred,limit_fixed,Mu_threshold,\ + termini_coverage,whole_coverage,paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, host_whole_coverage,insert,list_hybrid,reads_tested,DR,DR_path=None): + + print("\n\nFinished calculating coverage values, the remainder should be completed rapidly\n", + strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())) + + # WHOLE Coverage : Average, Maximum and Minimum + added_whole_coverage, ave_whole_cov = wholeCov(whole_coverage, len(refseq)) + added_paired_whole_coverage, ave_paired_whole_cov = wholeCov(paired_whole_coverage, len(refseq)) + added_host_whole_coverage, ave_host_whole_cov = wholeCov(host_whole_coverage, len(hostseq)) + + drop_cov = testwholeCov(added_whole_coverage, ave_whole_cov, test_run) + + # NORM pic by whole coverage (1 base) + if paired != "": + #paired_whole_coverage_test = maxPaired(paired_whole_coverage, whole_coverage) + termini_coverage_norm, mean_nc = normCov(termini_coverage, paired_whole_coverage, max(10, ave_whole_cov / 1.5), + edge) + else: + termini_coverage_norm, mean_nc = normCov(termini_coverage, whole_coverage, max(10, ave_whole_cov / 1.5), edge) + + # REMOVE edge + termini_coverage[0] = RemoveEdge(termini_coverage[0], edge) + termini_coverage[1] = RemoveEdge(termini_coverage[1], edge) + termini_coverage_norm[0] = RemoveEdge(termini_coverage_norm[0], edge) + termini_coverage_norm[1] = RemoveEdge(termini_coverage_norm[1], edge) + whole_coverage[0] = RemoveEdge(whole_coverage[0], edge) + whole_coverage[1] = RemoveEdge(whole_coverage[1], edge) + paired_whole_coverage[0] = RemoveEdge(paired_whole_coverage[0], edge) + paired_whole_coverage[1] = RemoveEdge(paired_whole_coverage[1], edge) + added_whole_coverage = RemoveEdge(added_whole_coverage, edge) + added_paired_whole_coverage = RemoveEdge(added_paired_whole_coverage, edge) + added_host_whole_coverage = RemoveEdge(added_host_whole_coverage, edge) + phage_hybrid_coverage[0] = RemoveEdge(phage_hybrid_coverage[0], edge) + phage_hybrid_coverage[1] = RemoveEdge(phage_hybrid_coverage[1], edge) + host_whole_coverage[0] = RemoveEdge(host_whole_coverage[0], edge) + host_whole_coverage[1] = RemoveEdge(host_whole_coverage[1], edge) + host_hybrid_coverage[0] = RemoveEdge(host_hybrid_coverage[0], edge) + host_hybrid_coverage[1] = RemoveEdge(host_hybrid_coverage[1], edge) + refseq = RemoveEdge(refseq, edge) + if host != "": + hostseq = RemoveEdge(hostseq, edge) + gen_len = len(refseq) + host_len = len(hostseq) + if test == "DL": + gen_len = 100000 + + # READS Total, Used and Lost + used_reads, lost_reads, lost_perc = usedReads(termini_coverage, reads_tested) + + # PIC Max + picMaxPlus, picMaxMinus, TopFreqH = picMax(termini_coverage, 5) + picMaxPlus_norm, picMaxMinus_norm, TopFreqH_norm = picMax(termini_coverage_norm, 5) + picMaxPlus_host, picMaxMinus_host, TopFreqH_host = picMax(host_whole_coverage, 5) + + ### ANALYSIS + + ## Close Peaks + picMaxPlus, picOUT_forw = RemoveClosePicMax(picMaxPlus, gen_len, surrounding) + picMaxMinus, picOUT_rev = RemoveClosePicMax(picMaxMinus, gen_len, surrounding) + picMaxPlus_norm, picOUT_norm_forw = RemoveClosePicMax(picMaxPlus_norm, gen_len, surrounding) + picMaxMinus_norm, picOUT_norm_rev = RemoveClosePicMax(picMaxMinus_norm, gen_len, surrounding) + + termini_coverage_close = termini_coverage[:] + termini_coverage_close[0], picOUT_forw = addClosePic(termini_coverage[0], picOUT_forw) + termini_coverage_close[1], picOUT_rev = addClosePic(termini_coverage[1], picOUT_rev) + + termini_coverage_norm_close = termini_coverage_norm[:] + termini_coverage_norm_close[0], picOUT_norm_forw = addClosePic(termini_coverage_norm[0], picOUT_norm_forw, 1) + termini_coverage_norm_close[1], picOUT_norm_rev = addClosePic(termini_coverage_norm[1], picOUT_norm_rev, 1) + + ## Statistical Analysis + picMaxPlus_norm_close, picMaxMinus_norm_close, TopFreqH_norm = picMax(termini_coverage_norm_close, 5) + phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, + termini_coverage_norm, + termini_coverage_norm_close) + # VL: comment that since the 2 different conditions lead to the execution of the same piece of code... + # if paired != "": + # phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, + # termini_coverage_norm, + # termini_coverage_norm_close) + # else: + # phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(whole_coverage, termini_coverage, + # termini_coverage_norm, + # termini_coverage_norm_close) + + + ## LI Analysis + picMaxPlus_close, picMaxMinus_close, TopFreqH = picMax(termini_coverage_close, 5) + + R1, AveFreq = ratioR1(TopFreqH, used_reads, gen_len) + R2 = ratioR(picMaxPlus_close) + R3 = ratioR(picMaxMinus_close) + + ArtPackmode, termini, forward, reverse = packMode(R1, R2, R3) + ArtOrient = orientation(picMaxPlus_close, picMaxMinus_close) + ArtcohesiveSeq, ArtPackmode = sequenceCohesive(ArtPackmode, refseq, picMaxPlus_close, picMaxMinus_close, + gen_len / 2) + + ### DECISION Process + + # PEAKS Significativity + plus_significant = selectSignificant(phage_plus_norm, 1.0 / gen_len, limit_preferred) + minus_significant = selectSignificant(phage_minus_norm, 1.0 / gen_len, limit_preferred) + + # DECISION + Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like = decisionProcess( + plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads, + seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq) + + ### Results + if len(refseq_liste) != 1: + if P_class == "-": + if P_left == "Random" and P_right == "Random": + P_class = "UNKNOWN" + else: + P_class = "NEW" + DR[P_class][checkReportTitle(refseq_name[results_pos])] = {"analysis_name": analysis_name, "seed": seed, + "added_whole_coverage": added_whole_coverage, + "Redundant": Redundant, "P_left": P_left, + "P_right": P_right, "Permuted": Permuted, + "P_orient": P_orient, + "termini_coverage_norm_close": termini_coverage_norm_close, + "picMaxPlus_norm_close": picMaxPlus_norm_close, + "picMaxMinus_norm_close": picMaxMinus_norm_close, + "gen_len": gen_len, "tot_reads": tot_reads, + "P_seqcoh": P_seqcoh, + "phage_plus_norm": phage_plus_norm, + "phage_minus_norm": phage_minus_norm, + "ArtPackmode": ArtPackmode, "termini": termini, + "forward": forward, "reverse": reverse, + "ArtOrient": ArtOrient, + "ArtcohesiveSeq": ArtcohesiveSeq, + "termini_coverage_close": termini_coverage_close, + "picMaxPlus_close": picMaxPlus_close, + "picMaxMinus_close": picMaxMinus_close, + "picOUT_norm_forw": picOUT_norm_forw, + "picOUT_norm_rev": picOUT_norm_rev, + "picOUT_forw": picOUT_forw, + "picOUT_rev": picOUT_rev, "lost_perc": lost_perc, + "ave_whole_cov": ave_whole_cov, "R1": R1, "R2": R2, + "R3": R3, "host": host, "host_len": host_len, + "host_whole_coverage": host_whole_coverage, + "picMaxPlus_host": picMaxPlus_host, + "picMaxMinus_host": picMaxMinus_host, + "surrounding": surrounding, "drop_cov": drop_cov, + "paired": paired, "insert": insert, + "phage_hybrid_coverage": phage_hybrid_coverage, + "host_hybrid_coverage": host_hybrid_coverage, + "added_paired_whole_coverage": added_paired_whole_coverage, + "Mu_like": Mu_like, "test_run": test_run, + "P_class": P_class, "P_type": P_type, + "P_concat": P_concat, + "idx_refseq_in_list": results_pos} + + if DR_path!=None: # multi machine cluster mode. + strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()) + P_class_dir=os.path.join(DR_path,P_class) + if os.path.exists(P_class_dir): + if not os.path.isdir(P_class_dir): + raise RuntimeError("P_class_dir is not a directory") + else: + os.mkdir(P_class_dir) + import pickle + fic_name=os.path.join(P_class_dir,checkReportTitle(refseq_name[results_pos])) + items_to_save=(analysis_name,seed,added_whole_coverage,Redundant,P_left,P_right,Permuted, \ + P_orient,termini_coverage_norm_close,picMaxPlus_norm_close,picMaxMinus_norm_close, \ + gen_len,tot_reads,P_seqcoh,phage_plus_norm,phage_minus_norm,ArtPackmode,termini, \ + forward,reverse,ArtOrient,ArtcohesiveSeq,termini_coverage_close,picMaxPlus_close, \ + picMaxMinus_close,picOUT_norm_forw,picOUT_norm_rev,picOUT_forw,picOUT_rev, \ + lost_perc,ave_whole_cov,R1,R2,R3,host,host_len,host_whole_coverage,picMaxPlus_host, \ + picMaxMinus_host,surrounding,drop_cov,paired, insert,phage_hybrid_coverage, \ + host_hybrid_coverage,added_paired_whole_coverage,Mu_like,test_run,P_class,P_type,\ + P_concat,results_pos) + with open(fic_name,'wb') as f: + pickle.dump(items_to_save,f) + f.close() + + return SeqStats(P_class, P_left, P_right, P_type, P_orient, ave_whole_cov, phage_plus_norm, phage_minus_norm, ArtcohesiveSeq, P_seqcoh, Redundant, Mu_like, \ + added_whole_coverage, Permuted, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, termini_coverage_close, \ + ArtPackmode, termini, forward, reverse, ArtOrient, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, \ + lost_perc, R1, R2, R3, picMaxPlus_host, picMaxMinus_host, drop_cov, added_paired_whole_coverage, P_concat)