Mercurial > repos > bgruening > bg_statistical_hypothesis_testing
view statistical_hypothesis_testing.py @ 0:178b22349b79 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/statistics commit 7c5002672919ca1e5eacacb835a4ce66ffa19656
author | bgruening |
---|---|
date | Mon, 21 Nov 2022 18:08:27 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ """ import argparse import numpy as np from scipy import stats def columns_to_values(args, line): # here you go over every list samples = [] for i in args: cols = line.split("\t") sample_list = [] for row in i: sample_list.append(cols[row - 1]) samples.append(list(map(int, sample_list))) return samples def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--infile", required=True, help="Tabular file.") parser.add_argument( "-o", "--outfile", required=True, help="Path to the output file." ) parser.add_argument("--sample_one_cols", help="Input format, like smi, sdf, inchi") parser.add_argument("--sample_two_cols", help="Input format, like smi, sdf, inchi") parser.add_argument( "--sample_cols", help="Input format, like smi, sdf, inchi,separate arrays using ;", ) parser.add_argument("--test_id", help="statistical test method") parser.add_argument( "--mwu_use_continuity", action="store_true", default=False, help="Whether a continuity correction (1/2.) should be taken into account.", ) parser.add_argument( "--equal_var", action="store_true", default=False, help="If set perform a standard independent 2 sample test that assumes equal population variances. If not set, perform Welch's t-test, which does not assume equal population variance.", ) parser.add_argument( "--reta", action="store_true", default=False, help="Whether or not to return the internally computed a values.", ) parser.add_argument( "--fisher", action="store_true", default=False, help="if true then Fisher definition is used", ) parser.add_argument( "--bias", action="store_true", default=False, help="if false,then the calculations are corrected for statistical bias", ) parser.add_argument( "--inclusive1", action="store_true", default=False, help="if false,lower_limit will be ignored", ) parser.add_argument( "--inclusive2", action="store_true", default=False, help="if false,higher_limit will be ignored", ) parser.add_argument( "--inclusive", action="store_true", default=False, help="if false,limit will be ignored", ) parser.add_argument( "--printextras", action="store_true", default=False, help="If True, if there are extra points a warning is raised saying how many of those points there are", ) parser.add_argument( "--initial_lexsort", action="store_true", default="False", help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.", ) parser.add_argument( "--correction", action="store_true", default=False, help="continuity correction ", ) parser.add_argument( "--axis", type=int, default=0, help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b)", ) parser.add_argument( "--n", type=int, default=0, help="the number of trials. This is ignored if x gives both the number of successes and failures", ) parser.add_argument( "--b", type=int, default=0, help="The number of bins to use for the histogram" ) parser.add_argument( "--N", type=int, default=0, help="Score that is compared to the elements in a." ) parser.add_argument( "--ddof", type=int, default=0, help="Degrees of freedom correction" ) parser.add_argument( "--score", type=int, default=0, help="Score that is compared to the elements in a.", ) parser.add_argument("--m", type=float, default=0.0, help="limits") parser.add_argument("--mf", type=float, default=2.0, help="lower limit") parser.add_argument("--nf", type=float, default=99.9, help="higher_limit") parser.add_argument( "--p", type=float, default=0.5, help="The hypothesized probability of success. 0 <= p <= 1. The default value is p = 0.5", ) parser.add_argument("--alpha", type=float, default=0.9, help="probability") parser.add_argument( "--new", type=float, default=0.0, help="Value to put in place of values in a outside of bounds", ) parser.add_argument( "--proportiontocut", type=float, default=0.0, help="Proportion (in range 0-1) of total data set to trim of each end.", ) parser.add_argument( "--lambda_", type=float, default=1.0, help="lambda_ gives the power in the Cressie-Read power divergence statistic", ) parser.add_argument( "--imbda", type=float, default=0, help="If lmbda is not None, do the transformation for that value.If lmbda is None, find the lambda that maximizes the log-likelihood function and return it as the second output argument.", ) parser.add_argument( "--base", type=float, default=1.6, help="The logarithmic base to use, defaults to e", ) parser.add_argument("--dtype", help="dtype") parser.add_argument("--med", help="med") parser.add_argument("--cdf", help="cdf") parser.add_argument("--zero_method", help="zero_method options") parser.add_argument("--dist", help="dist options") parser.add_argument("--ties", help="ties options") parser.add_argument("--alternative", help="alternative options") parser.add_argument("--mode", help="mode options") parser.add_argument("--method", help="method options") parser.add_argument("--md", help="md options") parser.add_argument("--center", help="center options") parser.add_argument("--kind", help="kind options") parser.add_argument("--tail", help="tail options") parser.add_argument("--interpolation", help="interpolation options") parser.add_argument("--statistic", help="statistic options") args = parser.parse_args() infile = args.infile outfile = open(args.outfile, "w+") test_id = args.test_id nf = args.nf mf = args.mf imbda = args.imbda inclusive1 = args.inclusive1 inclusive2 = args.inclusive2 sample0 = 0 sample1 = 0 sample2 = 0 if args.sample_cols is not None: sample0 = 1 barlett_samples = [] for sample in args.sample_cols.split(";"): barlett_samples.append(list(map(int, sample.split(",")))) if args.sample_one_cols is not None: sample1 = 1 sample_one_cols = args.sample_one_cols.split(",") if args.sample_two_cols is not None: sample_two_cols = args.sample_two_cols.split(",") sample2 = 1 for line in open(infile): sample_one = [] sample_two = [] cols = line.strip().split("\t") if sample0 == 1: b_samples = columns_to_values(barlett_samples, line) if sample1 == 1: for index in sample_one_cols: sample_one.append(cols[int(index) - 1]) if sample2 == 1: for index in sample_two_cols: sample_two.append(cols[int(index) - 1]) if test_id.strip() == "describe": size, min_max, mean, uv, bs, bk = stats.describe( list(map(float, sample_one)) ) cols.append(size) cols.append(min_max) cols.append(mean) cols.append(uv) cols.append(bs) cols.append(bk) elif test_id.strip() == "mode": vals, counts = stats.mode(list(map(float, sample_one))) cols.append(vals) cols.append(counts) elif test_id.strip() == "nanmean": m = stats.nanmean(list(map(float, sample_one))) cols.append(m) elif test_id.strip() == "nanmedian": m = stats.nanmedian(list(map(float, sample_one))) cols.append(m) elif test_id.strip() == "kurtosistest": z_value, p_value = stats.kurtosistest(list(map(float, sample_one))) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "variation": ra = stats.variation(list(map(float, sample_one))) cols.append(ra) elif test_id.strip() == "itemfreq": freq = np.unique(list(map(float, sample_one)), return_counts=True) for i in freq: elements = ",".join(list(map(str, i))) cols.append(elements) elif test_id.strip() == "nanmedian": m = stats.nanmedian(list(map(float, sample_one))) cols.append(m) elif test_id.strip() == "variation": ra = stats.variation(list(map(float, sample_one))) cols.append(ra) elif test_id.strip() == "boxcox_llf": IIf = stats.boxcox_llf(imbda, list(map(float, sample_one))) cols.append(IIf) elif test_id.strip() == "tiecorrect": fa = stats.tiecorrect(list(map(float, sample_one))) cols.append(fa) elif test_id.strip() == "rankdata": r = stats.rankdata(list(map(float, sample_one)), method=args.md) cols.append(r) elif test_id.strip() == "nanstd": s = stats.nanstd(list(map(float, sample_one)), bias=args.bias) cols.append(s) elif test_id.strip() == "anderson": A2, critical, sig = stats.anderson( list(map(float, sample_one)), dist=args.dist ) cols.append(A2) for i in critical: cols.append(i) cols.append(",") for i in sig: cols.append(i) elif test_id.strip() == "binom_test": p_value = stats.binom_test(list(map(float, sample_one)), n=args.n, p=args.p) cols.append(p_value) elif test_id.strip() == "gmean": gm = stats.gmean(list(map(float, sample_one)), dtype=args.dtype) cols.append(gm) elif test_id.strip() == "hmean": hm = stats.hmean(list(map(float, sample_one)), dtype=args.dtype) cols.append(hm) elif test_id.strip() == "kurtosis": k = stats.kurtosis( list(map(float, sample_one)), axis=args.axis, fisher=args.fisher, bias=args.bias, ) cols.append(k) elif test_id.strip() == "moment": n_moment = stats.moment(list(map(float, sample_one)), n=args.n) cols.append(n_moment) elif test_id.strip() == "normaltest": k2, p_value = stats.normaltest(list(map(float, sample_one))) cols.append(k2) cols.append(p_value) elif test_id.strip() == "skew": skewness = stats.skew(list(map(float, sample_one)), bias=args.bias) cols.append(skewness) elif test_id.strip() == "skewtest": z_value, p_value = stats.skewtest(list(map(float, sample_one))) cols.append(z_value) cols.append(p_value) elif test_id.strip() == "sem": s = stats.sem(list(map(float, sample_one)), ddof=args.ddof) cols.append(s) elif test_id.strip() == "zscore": z = stats.zscore(list(map(float, sample_one)), ddof=args.ddof) for i in z: cols.append(i) elif test_id.strip() == "signaltonoise": s2n = stats.signaltonoise(list(map(float, sample_one)), ddof=args.ddof) cols.append(s2n) elif test_id.strip() == "percentileofscore": p = stats.percentileofscore( list(map(float, sample_one)), score=args.score, kind=args.kind ) cols.append(p) elif test_id.strip() == "bayes_mvs": c_mean, c_var, c_std = stats.bayes_mvs( list(map(float, sample_one)), alpha=args.alpha ) cols.append(c_mean) cols.append(c_var) cols.append(c_std) elif test_id.strip() == "sigmaclip": c, c_low, c_up = stats.sigmaclip( list(map(float, sample_one)), low=args.m, high=args.n ) cols.append(c) cols.append(c_low) cols.append(c_up) elif test_id.strip() == "kstest": d, p_value = stats.kstest( list(map(float, sample_one)), cdf=args.cdf, N=args.N, alternative=args.alternative, mode=args.mode, ) cols.append(d) cols.append(p_value) elif test_id.strip() == "chi2_contingency": chi2, p, dof, ex = stats.chi2_contingency( list(map(float, sample_one)), correction=args.correction, lambda_=args.lambda_, ) cols.append(chi2) cols.append(p) cols.append(dof) cols.append(ex) elif test_id.strip() == "tmean": if nf == 0 and mf == 0: mean = stats.tmean(list(map(float, sample_one))) else: mean = stats.tmean( list(map(float, sample_one)), (mf, nf), (inclusive1, inclusive2) ) cols.append(mean) elif test_id.strip() == "tmin": if mf == 0: min = stats.tmin(list(map(float, sample_one))) else: min = stats.tmin( list(map(float, sample_one)), lowerlimit=mf, inclusive=args.inclusive, ) cols.append(min) elif test_id.strip() == "tmax": if nf == 0: max = stats.tmax(list(map(float, sample_one))) else: max = stats.tmax( list(map(float, sample_one)), upperlimit=nf, inclusive=args.inclusive, ) cols.append(max) elif test_id.strip() == "tvar": if nf == 0 and mf == 0: var = stats.tvar(list(map(float, sample_one))) else: var = stats.tvar( list(map(float, sample_one)), (mf, nf), (inclusive1, inclusive2) ) cols.append(var) elif test_id.strip() == "tstd": if nf == 0 and mf == 0: std = stats.tstd(list(map(float, sample_one))) else: std = stats.tstd( list(map(float, sample_one)), (mf, nf), (inclusive1, inclusive2) ) cols.append(std) elif test_id.strip() == "tsem": if nf == 0 and mf == 0: s = stats.tsem(list(map(float, sample_one))) else: s = stats.tsem( list(map(float, sample_one)), (mf, nf), (inclusive1, inclusive2) ) cols.append(s) elif test_id.strip() == "scoreatpercentile": if nf == 0 and mf == 0: s = stats.scoreatpercentile( list(map(float, sample_one)), list(map(float, sample_two)), interpolation_method=args.interpolation, ) else: s = stats.scoreatpercentile( list(map(float, sample_one)), list(map(float, sample_two)), (mf, nf), interpolation_method=args.interpolation, ) for i in s: cols.append(i) elif test_id.strip() == "relfreq": if nf == 0 and mf == 0: rel, low_range, binsize, ex = stats.relfreq( list(map(float, sample_one)), args.b ) else: rel, low_range, binsize, ex = stats.relfreq( list(map(float, sample_one)), args.b, (mf, nf) ) for i in rel: cols.append(i) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "binned_statistic": if nf == 0 and mf == 0: st, b_edge, b_n = stats.binned_statistic( list(map(float, sample_one)), list(map(float, sample_two)), statistic=args.statistic, bins=args.b, ) else: st, b_edge, b_n = stats.binned_statistic( list(map(float, sample_one)), list(map(float, sample_two)), statistic=args.statistic, bins=args.b, range=(mf, nf), ) cols.append(st) cols.append(b_edge) cols.append(b_n) elif test_id.strip() == "threshold": if nf == 0 and mf == 0: o = stats.threshold(list(map(float, sample_one)), newval=args.new) else: o = stats.threshold( list(map(float, sample_one)), mf, nf, newval=args.new ) for i in o: cols.append(i) elif test_id.strip() == "trimboth": o = stats.trimboth( list(map(float, sample_one)), proportiontocut=args.proportiontocut ) for i in o: cols.append(i) elif test_id.strip() == "trim1": t1 = stats.trim1( list(map(float, sample_one)), proportiontocut=args.proportiontocut, tail=args.tail, ) for i in t1: cols.append(i) elif test_id.strip() == "histogram": if nf == 0 and mf == 0: hi, low_range, binsize, ex = stats.histogram( list(map(float, sample_one)), args.b ) else: hi, low_range, binsize, ex = stats.histogram( list(map(float, sample_one)), args.b, (mf, nf) ) cols.append(hi) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "cumfreq": if nf == 0 and mf == 0: cum, low_range, binsize, ex = stats.cumfreq( list(map(float, sample_one)), args.b ) else: cum, low_range, binsize, ex = stats.cumfreq( list(map(float, sample_one)), args.b, (mf, nf) ) cols.append(cum) cols.append(low_range) cols.append(binsize) cols.append(ex) elif test_id.strip() == "boxcox_normmax": if nf == 0 and mf == 0: ma = stats.boxcox_normmax(list(map(float, sample_one))) else: ma = stats.boxcox_normmax( list(map(float, sample_one)), (mf, nf), method=args.method ) cols.append(ma) elif test_id.strip() == "boxcox": if imbda == 0: box, ma, ci = stats.boxcox( list(map(float, sample_one)), alpha=args.alpha ) cols.append(box) cols.append(ma) cols.append(ci) else: box = stats.boxcox( list(map(float, sample_one)), imbda, alpha=args.alpha ) cols.append(box) elif test_id.strip() == "histogram2": h2 = stats.histogram2( list(map(float, sample_one)), list(map(float, sample_two)) ) for i in h2: cols.append(i) elif test_id.strip() == "ranksums": z_statistic, p_value = stats.ranksums( list(map(float, sample_one)), list(map(float, sample_two)) ) cols.append(z_statistic) cols.append(p_value) elif test_id.strip() == "ttest_1samp": t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two)) for i in t: cols.append(i) for i in prob: cols.append(i) elif test_id.strip() == "ansari": AB, p_value = stats.ansari( list(map(float, sample_one)), list(map(float, sample_two)) ) cols.append(AB) cols.append(p_value) elif test_id.strip() == "linregress": slope, intercept, r_value, p_value, stderr = stats.linregress( list(map(float, sample_one)), list(map(float, sample_two)) ) cols.append(slope) cols.append(intercept) cols.append(r_value) cols.append(p_value) cols.append(stderr) elif test_id.strip() == "pearsonr": cor, p_value = stats.pearsonr( list(map(float, sample_one)), list(map(float, sample_two)) ) cols.append(cor) cols.append(p_value) elif test_id.strip() == "pointbiserialr": r, p_value = stats.pointbiserialr( list(map(float, sample_one)), list(map(float, sample_two)) ) cols.append(r) cols.append(p_value) elif test_id.strip() == "ks_2samp": d, p_value = stats.ks_2samp( list(map(float, sample_one)), list(map(float, sample_two)) ) cols.append(d) cols.append(p_value) elif test_id.strip() == "mannwhitneyu": mw_stats_u, p_value = stats.mannwhitneyu( list(map(float, sample_one)), list(map(float, sample_two)), use_continuity=args.mwu_use_continuity, ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "zmap": z = stats.zmap( list(map(float, sample_one)), list(map(float, sample_two)), ddof=args.ddof, ) for i in z: cols.append(i) elif test_id.strip() == "ttest_ind": mw_stats_u, p_value = stats.ttest_ind( list(map(float, sample_one)), list(map(float, sample_two)), equal_var=args.equal_var, ) cols.append(mw_stats_u) cols.append(p_value) elif test_id.strip() == "ttest_rel": t, prob = stats.ttest_rel( list(map(float, sample_one)), list(map(float, sample_two)), axis=args.axis, ) cols.append(t) cols.append(prob) elif test_id.strip() == "mood": z, p_value = stats.mood( list(map(float, sample_one)), list(map(float, sample_two)), axis=args.axis, ) cols.append(z) cols.append(p_value) elif test_id.strip() == "shapiro": W, p_value = stats.shapiro(list(map(float, sample_one))) cols.append(W) cols.append(p_value) elif test_id.strip() == "kendalltau": k, p_value = stats.kendalltau( list(map(float, sample_one)), list(map(float, sample_two)), initial_lexsort=args.initial_lexsort, ) cols.append(k) cols.append(p_value) elif test_id.strip() == "entropy": s = stats.entropy( list(map(float, sample_one)), list(map(float, sample_two)), base=args.base, ) cols.append(s) elif test_id.strip() == "spearmanr": if sample2 == 1: rho, p_value = stats.spearmanr( list(map(float, sample_one)), list(map(float, sample_two)) ) else: rho, p_value = stats.spearmanr(list(map(float, sample_one))) cols.append(rho) cols.append(p_value) elif test_id.strip() == "wilcoxon": if sample2 == 1: T, p_value = stats.wilcoxon( list(map(float, sample_one)), list(map(float, sample_two)), zero_method=args.zero_method, correction=args.correction, ) else: T, p_value = stats.wilcoxon( list(map(float, sample_one)), zero_method=args.zero_method, correction=args.correction, ) cols.append(T) cols.append(p_value) elif test_id.strip() == "chisquare": if sample2 == 1: rho, p_value = stats.chisquare( list(map(float, sample_one)), list(map(float, sample_two)), ddof=args.ddof, ) else: rho, p_value = stats.chisquare( list(map(float, sample_one)), ddof=args.ddof ) cols.append(rho) cols.append(p_value) elif test_id.strip() == "power_divergence": if sample2 == 1: stat, p_value = stats.power_divergence( list(map(float, sample_one)), list(map(float, sample_two)), ddof=args.ddof, lambda_=args.lambda_, ) else: stat, p_value = stats.power_divergence( list(map(float, sample_one)), ddof=args.ddof, lambda_=args.lambda_ ) cols.append(stat) cols.append(p_value) elif test_id.strip() == "theilslopes": if sample2 == 1: mpe, met, lo, up = stats.theilslopes( list(map(float, sample_one)), list(map(float, sample_two)), alpha=args.alpha, ) else: mpe, met, lo, up = stats.theilslopes( list(map(float, sample_one)), alpha=args.alpha ) cols.append(mpe) cols.append(met) cols.append(lo) cols.append(up) elif test_id.strip() == "combine_pvalues": if sample2 == 1: stat, p_value = stats.combine_pvalues( list(map(float, sample_one)), method=args.med, weights=list(map(float, sample_two)), ) else: stat, p_value = stats.combine_pvalues( list(map(float, sample_one)), method=args.med ) cols.append(stat) cols.append(p_value) elif test_id.strip() == "obrientransform": ob = stats.obrientransform(*b_samples) for i in ob: elements = ",".join(list(map(str, i))) cols.append(elements) elif test_id.strip() == "f_oneway": f_value, p_value = stats.f_oneway(*b_samples) cols.append(f_value) cols.append(p_value) elif test_id.strip() == "kruskal": h, p_value = stats.kruskal(*b_samples) cols.append(h) cols.append(p_value) elif test_id.strip() == "friedmanchisquare": fr, p_value = stats.friedmanchisquare(*b_samples) cols.append(fr) cols.append(p_value) elif test_id.strip() == "fligner": xsq, p_value = stats.fligner( center=args.center, proportiontocut=args.proportiontocut, *b_samples ) cols.append(xsq) cols.append(p_value) elif test_id.strip() == "bartlett": T, p_value = stats.bartlett(*b_samples) cols.append(T) cols.append(p_value) elif test_id.strip() == "levene": w, p_value = stats.levene( center=args.center, proportiontocut=args.proportiontocut, *b_samples ) cols.append(w) cols.append(p_value) elif test_id.strip() == "median_test": stat, p_value, m, table = stats.median_test( ties=args.ties, correction=args.correction, lambda_=args.lambda_, *b_samples ) cols.append(stat) cols.append(p_value) cols.append(m) cols.append(table) for i in table: elements = ",".join(list(map(str, i))) cols.append(elements) outfile.write("%s\n" % "\t".join(list(map(str, cols)))) outfile.close() if __name__ == "__main__": main()