Mercurial > repos > brigidar > vcf_to_snp

#!/usr/bin/env python

#########################################################################################
#											#
# Name	      :	vcf_snp.py								#
# Version     : 0.1									#
# Project     : extract snp from vcf						#
# Description : Script to extract snps		#
# Author      : Brigida Rusconi								#
# Date        : October 30 2015							#
#											#
#########################################################################################
#for replacement of a given value with NaN
#http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value

# to remove any symbol and replace it with nan
#http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python

# for isin information
#http://pandas.pydata.org/pandas-docs/stable/indexing.html

# for selecting rows that have an indel:
#http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without


#------------------------------------------------------------------------------------------


import argparse, os, sys, csv, IPython
import pandas
import pdb
import numpy as np
from pandas import *
from IPython import get_ipython
import matplotlib.pyplot as plt
from pandas.util.testing import assert_frame_equal
#------------------------------------------------------------------------------------------


# Command-line interface: the output and input file names are passed as options.
#   -o / --output     file the script writes its result to
#   -s / --snp_table  tab-separated input table the script reads
parser = argparse.ArgumentParser()
for flags, description in (
    (('-o', '--output'), "snp tab"),
    (('-s', '--snp_table'), "vcf"),
):
    parser.add_argument(*flags, help=description)

# Neither option is marked as required, so an omitted option yields None here.
args = parser.parse_args()
output_file, input_file = args.output, args.snp_table
#------------------------------------------------------------------------------------------


# Read the tab-separated input into a dataframe. The first line of the file is
# taken as the header and must contain the '#CHROM' and 'POS' columns, which
# become the row index. dtype=object turns off numeric type inference so every
# field is kept as read.
df = read_csv(input_file, sep='\t', dtype=object)
df = df.set_index(['#CHROM', 'POS'])

# Replace missing values with the placeholder '--' (original author's note:
# boolean operations cannot be done on the table otherwise).
df = df.fillna('--')

# Report the number of rows read. Written as a single-argument call so the
# line is valid on Python 3 and prints the same text on Python 2.
print("vcf " + str(df.index.size))
#------------------------------------------------------------------------------------------

# Keep only the columns whose name contains the substring 'ALT'.
count_qbase = df.columns.tolist()
# Positional indexes of the matching columns, in their original order.
qindexes = [
    position
    for position, label in enumerate(count_qbase)
    if 'ALT' in label
]
df2 = df.iloc[:, qindexes]

#------------------------------------------------------------------------------------------
#pdb.set_trace()

#------------------------------------------------------------------------------------------

# Write the selected columns, transposed, to the file given with -o.
# header=False drops the (#CHROM, POS) labels from the output; the index is
# left at its default, so each line still starts with the selected column's
# name followed by its tab-separated values.
transposed = df2.T
with open(output_file, 'w') as out_handle:
    transposed.to_csv(out_handle, sep='\t', header=False)
#------------------------------------------------------------------------------------------
author	brigidar
date	Mon, 02 Nov 2015 12:45:11 -0500
parents
children	ea2f686dfd4a