Mercurial > repos > brigidar > vcf_to_snp
view vcf_snp.py @ 0:75cedeb179aa draft
Uploaded
author | brigidar |
---|---|
date | Mon, 02 Nov 2015 12:45:11 -0500 |
parents | |
children | ea2f686dfd4a |
line wrap: on
line source
#!/usr/bin/env python
#########################################################################################
#                                                                                       #
# Name        : vcf_snp.py                                                              #
# Version     : 0.1                                                                     #
# Project     : extract snp from vcf                                                    #
# Description : Script to extract snps                                                  #
# Author      : Brigida Rusconi                                                         #
# Date        : October 30 2015                                                         #
#                                                                                       #
#########################################################################################
# for replacement of a given value with NaN
# http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
# to remove any symbol and replace it with nan
# http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
# for isin information
# http://pandas.pydata.org/pandas-docs/stable/indexing.html
# for selecting rows that have an indel:
# http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without
#------------------------------------------------------------------------------------------
"""Extract the ALT columns of a tab-separated VCF-style table.

The input table is read with every field kept as text, indexed by the
``#CHROM`` and ``POS`` columns, and reduced to the columns whose name contains
``ALT``.  The result is written transposed: one tab-separated line per
selected column, starting with the column name and followed by that column's
value for every input row.

Usage::

    vcf_snp.py -s INPUT_TABLE -o OUTPUT_TABLE
"""

import argparse
import csv
import os
import pdb
import sys

import numpy as np
import pandas
from pandas import *  # noqa: F401,F403 -- historical wildcard import, kept as is

# The packages below are not used by the conversion itself.  They stay
# importable for interactive debugging sessions, but a missing or relocated
# package must not prevent the tool from running.
try:
    import IPython
    from IPython import get_ipython
except ImportError:
    IPython = get_ipython = None
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
try:
    from pandas.testing import assert_frame_equal
except ImportError:  # older pandas releases only ship pandas.util.testing
    from pandas.util.testing import assert_frame_equal

# Only the entry point is public; this also keeps the names pulled in by the
# pandas wildcard import from being re-exported by this module.
__all__ = ['main']

#------------------------------------------------------------------------------------------


def _parse_args(argv=None):
    """Parse the command line.

    argv -- list of argument strings; ``None`` lets argparse use sys.argv[1:].

    Both options are required so that a missing file name is reported as a
    usage error by argparse instead of failing later inside pandas/open().
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', required=True, help="snp tab")
    parser.add_argument('-s', '--snp_table', required=True, help="vcf")
    return parser.parse_args(argv)


def _select_alt_columns(df):
    """Return the columns of *df* whose name contains 'ALT', in table order.

    Selection is positional (iloc) so duplicated column names are each picked
    exactly once.
    """
    positions = [i for i, name in enumerate(df.columns.values) if 'ALT' in name]
    return df.iloc[:, positions]


def main(argv=None):
    """Run the extraction for the files named on the command line.

    argv -- list of argument strings (``-s`` input table, ``-o`` output file);
            ``None`` means use sys.argv[1:].

    Prints "vcf <number of rows>" to stdout and writes the output file.
    pandas raises KeyError if the table lacks a '#CHROM' or 'POS' column.
    """
    args = _parse_args(argv)
    output_file = args.output
    input_file = args.snp_table

    # read in file as dataframe; dtype=object disables type inference so every
    # field is kept exactly as written in the file
    df = pandas.read_csv(input_file, sep='\t', dtype=object)
    df = df.set_index(['#CHROM', 'POS'])
    # need to fill na otherwise it cannot do boolean operations
    df = df.fillna('--')
    # single parenthesised argument: same output on Python 2 and Python 3
    print("vcf " + str(df.index.size))

    # only the ALT columns are kept in the output table
    df2 = _select_alt_columns(df)

    # save file with the name given by the -o option: the table is transposed
    # and written without a header line, so each line is the column name
    # followed by that column's values
    with open(output_file, 'w') as output:
        df2.T.to_csv(output, sep='\t', header=False)


#------------------------------------------------------------------------------------------
if __name__ == '__main__':
    main()
#------------------------------------------------------------------------------------------