annotate PDAUG_Word_Vector_Model/PDAUG_Word_Vector_Model.py @ 5:658dcb7f39f9 draft

"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
author jay
date Thu, 28 Jan 2021 04:32:15 +0000
parents 2df11ea23f10
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
1 import nltk
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
2 from nltk import trigrams
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
3 import pandas as pd
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
4 from Bio import SeqIO
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
5 import gensim, logging
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
6 import argparse
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
7
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
8 parser = argparse.ArgumentParser()
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
9
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
10 parser.add_argument("-I", "--Input", required=True, default=None, help="Path to target fasta file")
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
11 parser.add_argument("-M", "--min_count", required=False, default=0, help="Path to target tsv file")
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
12 parser.add_argument("-W", "--window", required=False, default=5, help="Path to target tsv file")
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
13 parser.add_argument("-O", "--OutFile", required=False, default='model.txt', help="Path to target tsv file")
5
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
14 parser.add_argument("-S", "--SG", required=False, default='skip-gram', help="Training algorithm: 1 for skip-gram; otherwise CBOW")
0
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
15
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
16 args = parser.parse_args()
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
17
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
18 class ProteinSeq(object):
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
19 def __init__(self):
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
20 pass
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
21 def __iter__(self):
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
22 for index, record in enumerate(SeqIO.parse(args.Input, 'fasta')):
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
23 for loop_num in range(0, 3):
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
24 Ngram_list = []
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
25 tri_tokens = trigrams(record.seq)
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
26 for index1, item in enumerate(tri_tokens):
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
27 if index1 % 3 == loop_num:
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
28 tri_pep = item[0] + item[1] + item[2]
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
29 Ngram_list.append(tri_pep)
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
30 yield Ngram_list
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
31 #min_count = 0
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
32 size = 200
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
33 #window = 5
5
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
34
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
35 print (args.SG)
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
36 if args.SG == 'skip-gram':
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
37 SG = 1
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
38 elif args.SG == 'CBOW':
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
39 SG = 0
0
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
40
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
41 sentences = ProteinSeq()
5
658dcb7f39f9 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit ac4353ca5c0ac9ce60df9f4bf160ed08b99fbee3"
jay
parents: 0
diff changeset
42 model = gensim.models.Word2Vec(sentences, min_count=int(args.min_count), size=size, window=int(args.window), sg = SG, workers = 10)
0
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
43 model.wv.save_word2vec_format(args.OutFile, binary=False)
2df11ea23f10 "planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
jay
parents:
diff changeset
44