Mercurial > repos > brigidar > vcf_to_snp
changeset 2:ea2f686dfd4a draft
Uploaded
author | brigidar |
---|---|
date | Mon, 02 Nov 2015 19:30:43 -0500 |
parents | 032d2c8cf8ae |
children | a928631d8297 |
files | vcf_snp.py |
diffstat | 1 files changed, 21 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/vcf_snp.py Mon Nov 02 12:45:32 2015 -0500 +++ b/vcf_snp.py Mon Nov 02 19:30:43 2015 -0500 @@ -3,25 +3,15 @@ ######################################################################################### # # # Name : vcf_snp.py # -# Version : 0.1 # +# Version : 0.2 # # Project : extract snp from vcf # # Description : Script to exctract snps # # Author : Brigida Rusconi # -# Date : October 30 2015 # +# Date : November 02 2015 # # # ######################################################################################### -#for replacement of a given value with NaN -#http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value -# to remove any symbol and replace it with nan -#http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python - -# for isin information -#http://pandas.pydata.org/pandas-docs/stable/indexing.html - -# for selecting rows that have an indel: -#http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without - +# If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column. #------------------------------------------------------------------------------------------ @@ -53,30 +43,30 @@ #read in file as dataframe df =read_csv(input_file,sep='\t', dtype=object) -df=df.set_index(['#CHROM','POS']) -#need to fill na otherwise it cannot do boolean operations -df=df.fillna('--') -print "vcf " + str(df.index.size) #------------------------------------------------------------------------------------------ # only columns with qbase and refbase in table -count_qbase=list(df.columns.values) -qindexes=[] -for i, v in enumerate(count_qbase): - if 'ALT' in v: - qindexes.append(i) -df2=df.iloc[:,qindexes] - -#------------------------------------------------------------------------------------------ +#count_qbase=list(df.columns.values) +#qindexes=[] +#for i, v in enumerate(count_qbase): +# if 'ALT' in v: +# qindexes.append(i) +df2=df.iloc[:,3:5] #pdb.set_trace() #------------------------------------------------------------------------------------------ - +ref_list=[] +for i in range(0,df2.index.size): + if df2.iloc[i,1]==".": + ref_list.append(df2.iloc[i,0][0]) + else: + ref_list.append(df2.iloc[i,1][0]) +#pdb.set_trace() +# +##------------------------------------------------------------------------------------------ +# #save file with output name for fasta -o option and removes header and index with open(output_file,'w') as output: - df2.T.to_csv(output, sep='\t',header=False) -#------------------------------------------------------------------------------------------ - - - + output.write(df.columns.values[4] + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)])) +##------------------------------------------------------------------------------------------ \ No newline at end of file