comparison vcf_snp.py @ 2:ea2f686dfd4a draft

Uploaded
author brigidar
date Mon, 02 Nov 2015 19:30:43 -0500
parents 75cedeb179aa
children 866cd9ce1cbd
comparison
equal deleted inserted replaced
1:032d2c8cf8ae 2:ea2f686dfd4a
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 ######################################################################################### 3 #########################################################################################
4 # # 4 # #
5 # Name : vcf_snp.py # 5 # Name : vcf_snp.py #
6 # Version : 0.1 # 6 # Version : 0.2 #
7 # Project : extract snp from vcf # 7 # Project : extract snp from vcf #
8 # Description : Script to exctract snps # 8 # Description : Script to exctract snps #
9 # Author : Brigida Rusconi # 9 # Author : Brigida Rusconi #
10 # Date : October 30 2015 # 10 # Date : November 02 2015 #
11 # # 11 # #
12 ######################################################################################### 12 #########################################################################################
13 #for replacement of a given value with NaN
14 #http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
15 13
16 # to remove any symbol and replace it with nan 14 # If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column.
17 #http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
18
19 # for isin information
20 #http://pandas.pydata.org/pandas-docs/stable/indexing.html
21
22 # for selecting rows that have an indel:
23 #http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without
24
25 15
26 16
27 #------------------------------------------------------------------------------------------ 17 #------------------------------------------------------------------------------------------
28 18
29 19
51 #------------------------------------------------------------------------------------------ 41 #------------------------------------------------------------------------------------------
52 42
53 43
54 #read in file as dataframe 44 #read in file as dataframe
55 df =read_csv(input_file,sep='\t', dtype=object) 45 df =read_csv(input_file,sep='\t', dtype=object)
56 df=df.set_index(['#CHROM','POS'])
57 46
58 #need to fill na otherwise it cannot do boolean operations
59 df=df.fillna('--')
60 print "vcf " + str(df.index.size)
61 #------------------------------------------------------------------------------------------ 47 #------------------------------------------------------------------------------------------
62 48
63 # only columns with qbase and refbase in table 49 # only columns with qbase and refbase in table
64 count_qbase=list(df.columns.values) 50 #count_qbase=list(df.columns.values)
65 qindexes=[] 51 #qindexes=[]
66 for i, v in enumerate(count_qbase): 52 #for i, v in enumerate(count_qbase):
67 if 'ALT' in v: 53 # if 'ALT' in v:
68 qindexes.append(i) 54 # qindexes.append(i)
69 df2=df.iloc[:,qindexes] 55 df2=df.iloc[:,3:5]
70
71 #------------------------------------------------------------------------------------------
72 #pdb.set_trace() 56 #pdb.set_trace()
73 57
74 #------------------------------------------------------------------------------------------ 58 #------------------------------------------------------------------------------------------
75 59 ref_list=[]
60 for i in range(0,df2.index.size):
61 if df2.iloc[i,1]==".":
62 ref_list.append(df2.iloc[i,0][0])
63 else:
64 ref_list.append(df2.iloc[i,1][0])
65 #pdb.set_trace()
66 #
67 ##------------------------------------------------------------------------------------------
68 #
76 #save file with output name for fasta -o option and removes header and index 69 #save file with output name for fasta -o option and removes header and index
77 with open(output_file,'w') as output: 70 with open(output_file,'w') as output:
78 df2.T.to_csv(output, sep='\t',header=False) 71 output.write(df.columns.values[4] + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)]))
79 #------------------------------------------------------------------------------------------ 72 ##------------------------------------------------------------------------------------------
80
81
82