view vcf_snp.py @ 4:866cd9ce1cbd draft

update anaconda python
author brigidar
date Mon, 02 Nov 2015 19:42:35 -0500
parents ea2f686dfd4a
children 3a352cb57117
line wrap: on
line source

#!/usr/bin/env python

#########################################################################################
#											#
# Name	      :	vcf_snp.py								#
# Version     : 0.2									#
# Project     : extract snp from vcf						#
# Description : Script to exctract snps		#
# Author      : Brigida Rusconi								#
# Date        : November 02 2015							#
#											#
#########################################################################################

# If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column.


#------------------------------------------------------------------------------------------


import argparse, os, sys, csv, IPython
import pandas
import pdb
from pandas import *

#------------------------------------------------------------------------------------------


#output and input file name to give with the script
parser = argparse.ArgumentParser()

parser.add_argument('-o', '--output', help="snp tab")
parser.add_argument('-s', '--snp_table', help="vcf")


args = parser.parse_args()
output_file = args.output
input_file = args.snp_table
#------------------------------------------------------------------------------------------


#read in file as dataframe
df =read_csv(input_file,sep='\t', dtype=object)

#------------------------------------------------------------------------------------------

# only columns with qbase and refbase in table
#count_qbase=list(df.columns.values)
#qindexes=[]
#for i, v in enumerate(count_qbase):
#    if 'ALT' in v:
#        qindexes.append(i)
df2=df.iloc[:,3:5]
#pdb.set_trace()

#------------------------------------------------------------------------------------------
ref_list=[]
for i in range(0,df2.index.size):
    if df2.iloc[i,1]==".":
        ref_list.append(df2.iloc[i,0][0])
    else:
        ref_list.append(df2.iloc[i,1][0])
#pdb.set_trace()
#
##------------------------------------------------------------------------------------------
#
#save file with output name for fasta -o option and removes header and index
with open(output_file,'w') as output:
    output.write(df.columns.values[4] + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)]))
##------------------------------------------------------------------------------------------