Mercurial > repos > jay > pdaug_sequence_similarity_network
diff PDAUG_Word_Vector_Descriptor/PDAUG_Word_Vector_Descriptor.py @ 0:e650de82bcc7 draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 01:50:00 +0000 |
parents | |
children | 5ae3966929db |
line wrap: on
line diff
# File: PDAUG_Word_Vector_Descriptor/PDAUG_Word_Vector_Descriptor.py
"""Turn a FASTA file into a tab-separated table of summed trigram word vectors.

Every FASTA record becomes one row of the output: the element-wise sum of
the word-vector of each overlapping three-character window of the sequence
that is present in the supplied word2vec model.  Windows missing from the
model are skipped.  A final ``Class_label`` column holds 1.0 for the first
``--positive`` rows followed by 0.0 for the next ``--negative`` rows.
"""
import argparse
import os
import time

import gensim
import numpy as np
import pandas as pd
from Bio import SeqIO
from nltk import bigrams
from nltk import trigrams

SEED = 42


def _parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser(
        description="Summed trigram word-vector descriptors for FASTA sequences."
    )
    parser.add_argument("-M", "--ModelInput", required=True,
                        help="Path to the word2vec model in text (non-binary) format")
    parser.add_argument("-R", "--row", required=True, type=int,
                        help="Number of rows to allocate in the descriptor matrix")
    parser.add_argument("-I", "--InputFasta", required=True,
                        help="Path to the input FASTA file")
    parser.add_argument("-O", "--OutFile", required=False, default='model.txt',
                        help="Path of the tab-separated output file")
    parser.add_argument("-P", "--positive", required=True, type=int,
                        help="Number of leading rows labelled 1")
    parser.add_argument("-N", "--negative", required=True, type=int,
                        help="Number of following rows labelled 0")
    return parser.parse_args(argv)


def _sequence_vector(sequence, model):
    """Return the sum of the model vectors of all known trigrams in *sequence*.

    The sum starts from the integer 0 and adds vectors left to right, so a
    sequence with no known trigram yields plain ``0`` (which fills a matrix
    row with zeros on assignment).
    """
    words = ("".join(gram) for gram in trigrams(sequence))
    # ``word in model`` is a direct vocabulary lookup; no per-trigram copy of
    # the whole vocabulary is made.
    return sum((model[word] for word in words if word in model), 0)


def _class_labels(positive, negative):
    """Return a 1-D array of *positive* ones followed by *negative* zeros."""
    return np.concatenate((np.ones(positive), np.zeros(negative)))


def main():
    """Load the model, build the descriptor matrix and write it as TSV."""
    args = _parse_args()

    # Kept from the original script; nothing below draws random numbers.
    np.random.seed(SEED)

    model = gensim.models.KeyedVectors.load_word2vec_format(args.ModelInput, binary=False)

    # Width follows the loaded model instead of a fixed 200 columns.
    features = np.zeros(shape=(args.row, model.vector_size))

    for index, record in enumerate(SeqIO.parse(args.InputFasta, 'fasta')):
        if index >= args.row:
            raise IndexError(
                "FASTA file contains more records than --row (%d)" % args.row
            )
        features[index] = _sequence_vector(str(record.seq), model)

    columns = list(range(features.shape[1]))
    table = pd.DataFrame(features, columns=columns)
    labels = pd.DataFrame(_class_labels(args.positive, args.negative), columns=["Class_label"])

    # Column-wise join aligns on the row index; if positive + negative differs
    # from --row the shorter side is padded with NaN by pandas.
    table = pd.concat([table, labels], axis=1)

    table.to_csv(args.OutFile, index=False, sep="\t")


if __name__ == "__main__":
    main()