comparison PDAUG_Word_Vector_Model/PDAUG_Word_Vector_Model.py @ 0:1199b572b86c draft

"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author jay
date Wed, 28 Oct 2020 02:40:51 +0000
parents
children 87b621620d52
comparison
equal deleted inserted replaced
-1:000000000000 0:1199b572b86c
1 import nltk
2 from nltk import trigrams
3 import pandas as pd
4 from Bio import SeqIO
5 import gensim, logging
6 import argparse
7
# Command-line interface: input FASTA file, two Word2Vec hyper-parameters,
# and the path the trained vectors are written to.
parser = argparse.ArgumentParser()

parser.add_argument("-I", "--Input", required=True, default=None, help="Path to target fasta file")
# type=int lets argparse reject non-integer values with a usage error instead
# of failing later with a traceback when the value is converted.
parser.add_argument("-M", "--min_count", required=False, default=0, type=int, help="Minimum word frequency passed to Word2Vec as min_count (default: 0)")
parser.add_argument("-W", "--window", required=False, default=5, type=int, help="Context window size passed to Word2Vec as window (default: 5)")
parser.add_argument("-O", "--OutFile", required=False, default='model.txt', help="Path of the output file for the word vectors in word2vec text format (default: model.txt)")

args = parser.parse_args()
16
class ProteinSeq(object):
    """Iterable of tri-peptide "sentences" built from the input FASTA file.

    Each iteration re-reads ``args.Input`` with ``SeqIO.parse(..., 'fasta')``.
    For every record, three lists are yielded, one per offset 0, 1 and 2.
    The list for a given offset holds the 3-letter words formed from the
    trigrams of the sequence whose position modulo 3 equals that offset.
    A list may be empty when the sequence has no trigram at that offset.
    """

    def __init__(self):
        pass

    def __iter__(self):
        for record in SeqIO.parse(args.Input, 'fasta'):
            for frame in range(3):
                # Keep only the trigrams belonging to this offset and glue
                # the three residues of each one into a single word.
                yield [
                    first + second + third
                    for position, (first, second, third) in enumerate(trigrams(record.seq))
                    if position % 3 == frame
                ]
# Fixed training settings; min_count and window come from the command line.
size = 200  # passed to Word2Vec as ``size``
sg = 1      # passed to Word2Vec as ``sg``

# Train on the tri-peptide sentences, then write the word vectors as text.
sentences = ProteinSeq()
model = gensim.models.Word2Vec(
    sentences,
    min_count=int(args.min_count),
    size=size,
    window=int(args.window),
    sg=sg,
    workers=10,
)
model.wv.save_word2vec_format(args.OutFile, binary=False)
38