diff vcf_snp.py @ 0:75cedeb179aa draft

Uploaded
author brigidar
date Mon, 02 Nov 2015 12:45:11 -0500
parents
children ea2f686dfd4a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_snp.py	Mon Nov 02 12:45:11 2015 -0500
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+#########################################################################################
+#											#
+# Name	      :	vcf_snp.py								#
+# Version     : 0.1									#
+# Project     : extract snp from vcf						#
+# Description : Script to extract snps		#
+# Author      : Brigida Rusconi								#
+# Date        : October 30 2015							#
+#											#
+#########################################################################################
+#for replacement of a given value with NaN
+#http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
+
+# to remove any symbol and replace it with nan
+#http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
+
+# for isin information
+#http://pandas.pydata.org/pandas-docs/stable/indexing.html
+
+# for selecting rows that have an indel:
+#http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without
+
+
+
+#------------------------------------------------------------------------------------------
+
+
+import argparse, os, sys, csv, IPython
+import pandas
+import pdb
+import numpy as np
+from pandas import *
+from IPython import get_ipython
+import matplotlib.pyplot as plt
+from pandas.util.testing import assert_frame_equal
+#------------------------------------------------------------------------------------------
+
+
+# Command-line interface: both file paths are mandatory, so argparse reports a
+# clear usage error instead of a later failure on a None path.
+parser = argparse.ArgumentParser()
+# -o: path of the tab-separated table written at the end of the script.
+parser.add_argument('-o', '--output', required=True, help="snp tab")
+# -s: path of the tab-separated input table (VCF) read into the dataframe.
+parser.add_argument('-s', '--snp_table', required=True, help="vcf")
+args = parser.parse_args()
+output_file = args.output
+input_file = args.snp_table
+#------------------------------------------------------------------------------------------
+
+
+# Read the tab-separated input; dtype=object stops pandas from inferring types.
+df = read_csv(input_file, sep='\t', dtype=object)
+df = df.set_index(['#CHROM', 'POS'])  # index rows by chromosome and position
+# Fill missing cells with '--' (NaN would get in the way of boolean operations).
+df = df.fillna('--')
+# Report the row count; the call form of print runs on Python 2 and Python 3.
+print("vcf " + str(df.index.size))
+#------------------------------------------------------------------------------------------
+
+# Keep only the columns whose name contains 'ALT' (the earlier note spoke of
+# qbase/refbase columns, but 'ALT' is what the test below matches).
+# Positions rather than labels are gathered because .iloc selects by position.
+count_qbase = list(df.columns)
+qindexes = [pos for pos, name in enumerate(count_qbase) if 'ALT' in name]
+
+df2 = df.iloc[:, qindexes]
+
+#------------------------------------------------------------------------------------------
+#pdb.set_trace()
+
+#------------------------------------------------------------------------------------------
+
+# Write df2 transposed to the -o file, tab-separated, without the column-label row (row labels stay).
+with open(output_file, 'w') as out_handle:
+    df2.transpose().to_csv(out_handle, sep='\t', header=False)
+#------------------------------------------------------------------------------------------
+
+
+