Mercurial > repos > jay > pdaug_word_vector_descriptor
view PDAUG_Word_Vector_Model/PDAUG_Word_Vector_Model.py @ 1:a196bbbc7471 draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit 3c91f421d26c8f42cf2671e47db735d2cf69dde8"
author | jay |
---|---|
date | Tue, 29 Dec 2020 04:40:30 +0000 |
parents | a3a1d9bea1ad |
children | 8de738fa6552 |
line wrap: on
line source
"""Train a word2vec model on 3-letter "words" cut from FASTA sequences.

Every record of the input FASTA file is turned into three "sentences", one
per reading frame (start offsets 0, 1 and 2).  Each sentence is the list of
consecutive, non-overlapping 3-letter words read from that offset.  The
sentences are fed to gensim's Word2Vec and the resulting vectors are written
to the output file in word2vec text format.
"""
import argparse
import inspect
import logging

import gensim
import nltk
import pandas as pd
from Bio import SeqIO
from nltk import trigrams

parser = argparse.ArgumentParser(
    description="Train a word2vec model on 3-letter words from a fasta file")
parser.add_argument("-I", "--Input",
                    required=True,
                    help="Path to target fasta file")
parser.add_argument("-M", "--min_count",
                    required=False,
                    type=int,
                    default=0,
                    help="Value passed to Word2Vec as min_count (default: 0)")
parser.add_argument("-W", "--window",
                    required=False,
                    type=int,
                    default=5,
                    help="Value passed to Word2Vec as window (default: 5)")
parser.add_argument("-O", "--OutFile",
                    required=False,
                    default='model.txt',
                    help="Path to the output model file, written in word2vec "
                         "text format (default: model.txt)")
args = parser.parse_args()


class ProteinSeq(object):
    """Re-iterable corpus of 3-letter-word sentences read from a FASTA file.

    A fresh pass over the file starts every time the object is iterated, so
    it can be consumed more than once.
    NOTE(review): gensim is documented to scan the corpus several times
    (vocabulary pass, then training passes), which is presumably why this is
    a class with ``__iter__`` rather than a one-shot generator.

    Args:
        fasta_path: Path of the FASTA file to read.  When omitted, the
            ``--Input`` command-line value is used, so ``ProteinSeq()``
            behaves as before.
    """

    def __init__(self, fasta_path=None):
        self.fasta_path = args.Input if fasta_path is None else fasta_path

    def __iter__(self):
        """Yield, for each record, three lists of 3-letter words.

        The lists correspond to start offsets 0, 1 and 2 within the
        sequence.  A list may be empty when the sequence is too short to
        hold a full word at that offset.
        """
        for record in SeqIO.parse(self.fasta_path, 'fasta'):
            # Build every sliding 3-letter window once; the window starting
            # at position i sits at index i, so taking every third entry
            # from ``frame`` gives the non-overlapping words of that frame.
            windows = ["".join(gram) for gram in trigrams(str(record.seq))]
            for frame in range(3):
                yield windows[frame::3]


size = 200  # dimensionality requested for the word vectors
sg = 1      # per the gensim Word2Vec docs, 1 selects skip-gram training

sentences = ProteinSeq(args.Input)

# gensim 4 renamed the ``size`` keyword to ``vector_size``; use whichever
# name the installed version accepts so the script runs on both.
_w2v_params = inspect.signature(gensim.models.Word2Vec.__init__).parameters
_size_kwarg = "vector_size" if "vector_size" in _w2v_params else "size"

model = gensim.models.Word2Vec(
    sentences,
    min_count=args.min_count,
    window=args.window,
    sg=sg,
    workers=10,
    **{_size_kwarg: size}
)

# binary=False writes the plain-text word2vec format.
model.wv.save_word2vec_format(args.OutFile, binary=False)