Mercurial > repos > brigidar > vcf_to_snp
diff vcf_snp.py @ 0:75cedeb179aa draft
Uploaded
author | brigidar |
---|---|
date | Mon, 02 Nov 2015 12:45:11 -0500 |
parents | |
children | ea2f686dfd4a |
line wrap: on
line diff
#!/usr/bin/env python

#########################################################################################
#                                                                                       #
# Name        : vcf_snp.py                                                              #
# Version     : 0.1                                                                     #
# Project     : extract snp from vcf                                                    #
# Description : Script to extract snps                                                  #
# Author      : Brigida Rusconi                                                         #
# Date        : October 30 2015                                                         #
#                                                                                       #
#########################################################################################
# for replacement of a given value with NaN
# http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value

# to remove any symbol and replace it with nan
# http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python

# for isin information
# http://pandas.pydata.org/pandas-docs/stable/indexing.html

# for selecting rows that have an indel:
# http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without

# Summary of what this script does:
#   1. reads a tab-separated table (-s/--snp_table) with every value kept as a string,
#   2. indexes it by the '#CHROM' and 'POS' columns,
#   3. replaces missing values with '--',
#   4. keeps only the columns whose name contains 'ALT',
#   5. writes the transposed selection, tab-separated, to -o/--output.
#
# Usage: vcf_snp.py -s <input table> -o <output table>

#------------------------------------------------------------------------------------------

# NOTE(review): os, sys, csv, IPython, pdb, numpy, matplotlib and
# assert_frame_equal are not used by the code below; they are kept only so that
# this edit does not remove any import. `pandas.util.testing` is not available
# in recent pandas releases -- confirm whether these imports can be dropped.
import argparse
import os
import sys
import csv
import IPython
import pandas
import pdb
import numpy as np
from pandas import *
from IPython import get_ipython
import matplotlib.pyplot as plt
from pandas.util.testing import assert_frame_equal
#------------------------------------------------------------------------------------------

# output and input file name to give with the script
parser = argparse.ArgumentParser()

# Both paths are mandatory: without them the read/write calls below would fail
# later with a much less helpful error, so let argparse report it up front.
parser.add_argument('-o', '--output', required=True, help="snp tab")
parser.add_argument('-s', '--snp_table', required=True, help="vcf")

args = parser.parse_args()
output_file = args.output
input_file = args.snp_table
#------------------------------------------------------------------------------------------

# read in file as dataframe; dtype=object keeps every cell as read (no numeric
# conversion).
# NOTE(review): the header row must contain '#CHROM' and 'POS' columns, so the
# input presumably has any leading '##' VCF meta lines already removed -- confirm.
df = pandas.read_csv(input_file, sep='\t', dtype=object)
df = df.set_index(['#CHROM', 'POS'])

# need to fill na otherwise it cannot do boolean operations
df = df.fillna('--')

# report the number of rows read; the call form works on Python 2 and Python 3
print("vcf " + str(df.index.size))
#------------------------------------------------------------------------------------------

# keep only the columns whose name contains the substring 'ALT'
count_qbase = list(df.columns.values)
qindexes = [i for i, v in enumerate(count_qbase) if 'ALT' in v]
df2 = df.iloc[:, qindexes]

#------------------------------------------------------------------------------------------

# save file with output name given by the -o option. The table is transposed so
# each selected column becomes one output row; header=False omits the header
# row (the '#CHROM'/'POS' labels), while the row label (the original column
# name) is still written as the first field.
with open(output_file, 'w') as output:
    df2.T.to_csv(output, sep='\t', header=False)
#------------------------------------------------------------------------------------------