Mercurial > repos > jay > pdaug_ml_models
diff PDAUG_Peptide_Ngrams/PDAUG_Peptide_Ngrams.py @ 0:0973f093d98f draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 02:31:40 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PDAUG_Peptide_Ngrams/PDAUG_Peptide_Ngrams.py Wed Oct 28 02:31:40 2020 +0000 @@ -0,0 +1,82 @@ +import matplotlib +matplotlib.use('Agg') +import os +import sys +sys.path.insert(0, os.path.abspath('..')) +import quantiprot +from quantiprot.utils.io import load_fasta_file +from quantiprot.utils.feature import Feature, FeatureSet +from quantiprot.metrics.aaindex import get_aa2hydropathy +from quantiprot.metrics.basic import identity +from quantiprot.metrics.ngram import pattern_match, pattern_count +from quantiprot.analysis.ngram import ngram_count +from quantiprot.analysis.ngram import zipf_law_fit +from matplotlib import pyplot as plt + + +def Run_ngrams(fasta1, fasta2, OutFile ): + + alphasyn_seq = load_fasta_file(fasta1) + amyload_pos_seq = load_fasta_file(fasta2) + + fs_aa = FeatureSet("aa patterns") + fs_aa.add(identity) + fs_aa.add(pattern_match, pattern='VT', padded=True) + fs_aa.add(pattern_count, pattern='VT') + + result_seq = fs_aa(alphasyn_seq) + + fs_hp = FeatureSet("hydropathy patterns") + fs_hp.add(Feature(get_aa2hydropathy())) + fs_hp.add(Feature(get_aa2hydropathy()).then(pattern_match, pattern=[0.0, 2.0], + metric='taxi', radius=1.0)) + result_seq2 = fs_hp(alphasyn_seq) + result_freq = ngram_count(alphasyn_seq, n=2) + result_fit = zipf_law_fit(amyload_pos_seq, n=3, verbose=True) + + counts = sorted(result_fit["ngram_counts"], reverse=True) + ranks = range(1, len(counts)+1) + + slope = result_fit["slope"] + harmonic_num = sum([rank**-slope for rank in ranks]) + fitted_counts = [(rank**-slope) / harmonic_num * sum(counts) for rank in ranks] + + plt.plot(ranks, counts, 'k', label="empirical") + plt.plot(ranks, fitted_counts, 'k--', + label="Zipf's law\nslope: {:.2f}".format((slope))) + plt.xlabel('rank') + plt.ylabel('count') + plt.xscale('log') + plt.yscale('log') + plt.legend() + + plt.savefig(OutFile) + +if __name__=="__main__": + + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("-f1", "--Fasta1", + required=True, + default=None, + help="First fasta file") + + parser.add_argument("-f2", "--Fasta2", + required=True, + default=None, + help="Second fasta file") + + + parser.add_argument("--OutFile", + required=True, + help="HTML out file", + default="report.html") + + + args = parser.parse_args() + + Run_ngrams(args.Fasta1, args.Fasta2, args.OutFile) +