diff PDAUG_Word_Vector_Descriptor/PDAUG_Word_Vector_Descriptor.py @ 0:7557b48b2872 draft

"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author jay
date Wed, 28 Oct 2020 02:10:12 +0000
parents
children 5d2fee54cedd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PDAUG_Word_Vector_Descriptor/PDAUG_Word_Vector_Descriptor.py	Wed Oct 28 02:10:12 2020 +0000
@@ -0,0 +1,59 @@
+import numpy as np
+import os
+import pandas as pd
+from Bio import SeqIO
+from nltk import bigrams
+from nltk import trigrams
+import gensim
+import argparse
+
+def build_parser():
+    """Return the argument parser describing this tool's command line."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-M", "--ModelInput", required=True, help="Path to the word2vec model in text format")
+    parser.add_argument("-R", "--row", required=True, type=int, help="Number of rows in the output matrix (total sequences)")
+    parser.add_argument("-I", "--InputFasta", required=True, help="Path to the input FASTA file")
+    parser.add_argument("-O", "--OutFile", required=False, default='model.txt', help="Path to the output tsv file")
+    parser.add_argument("-P", "--positive", required=True, type=int, help="Number of positive sequences (leading rows labelled 1)")
+    parser.add_argument("-N", "--negative", required=True, type=int, help="Number of negative sequences (trailing rows labelled 0)")
+    return parser
+
+
+def embed_sequence(sequence, vectors, size):
+    """Return the summed vectors of all overlapping 3-mers of ``sequence``.
+
+    ``vectors`` only needs ``in`` and ``[]`` (e.g. gensim KeyedVectors or a
+    dict); 3-mers it lacks are skipped. With no known 3-mer the result is
+    ``np.zeros(size)``.
+    """
+    residues = str(sequence)
+    kmers = (residues[i:i + 3] for i in range(len(residues) - 2))
+    # Direct membership test: no per-3-mer copy of the whole vocabulary.
+    hits = [vectors[kmer] for kmer in kmers if kmer in vectors]
+    return sum(hits) if hits else np.zeros(size)
+
+
+def main(argv=None):
+    """Embed every FASTA record and write features plus labels as a tsv."""
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if args.positive + args.negative != args.row:
+        parser.error("--positive plus --negative must equal --row")
+    # Fixed seed, as before, so any NumPy randomness stays reproducible.
+    np.random.seed(42)
+    model = gensim.models.KeyedVectors.load_word2vec_format(args.ModelInput, binary=False)
+    # Size the matrix from the model rather than assuming 200 dimensions.
+    features = np.zeros(shape=(args.row, model.vector_size))
+    for index, record in enumerate(SeqIO.parse(args.InputFasta, 'fasta')):
+        if index >= args.row:
+            parser.error("InputFasta holds more records than --row")
+        features[index] = embed_sequence(record.seq, model, model.vector_size)
+    labels = np.concatenate((np.ones(args.positive), np.zeros(args.negative)))
+    class_label = pd.DataFrame(labels, columns=["Class_label"])
+    df = pd.DataFrame(features, columns=list(range(features.shape[1])))
+    df = pd.concat([df, class_label], axis=1)
+    df.to_csv(args.OutFile, index=None, sep="\t")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file