comparison vcf_snp.py @ 6:3a352cb57117 draft

new script updates
author brigidar
date Fri, 06 Nov 2015 15:12:17 -0500
parents 866cd9ce1cbd
children e9dcf64ef8ec
comparison
equal deleted inserted replaced
5:d5725351bf22 6:3a352cb57117
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 ##!/usr/local/anaconda/bin/python
2 3
3 ######################################################################################### 4 #########################################################################################
4 # # 5 # #
5 # Name : vcf_snp.py # 6 # Name : vcf_snp.py #
6 # Version : 0.2 # 7 # Version : 0.3 #
7 # Project : extract snp from vcf # 8 # Project : extract snp from vcf #
8 # Description : Script to exctract snps # 9 # Description : Script to exctract snps #
9 # Author : Brigida Rusconi # 10 # Author : Brigida Rusconi #
10 # Date : November 02 2015 # 11 # Date : November 06 2015 #
11 # # 12 # #
12 ######################################################################################### 13 #########################################################################################
13 14
14 # If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column. 15 # If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column.
15 16
28 #output and input file name to give with the script 29 #output and input file name to give with the script
29 parser = argparse.ArgumentParser() 30 parser = argparse.ArgumentParser()
30 31
31 parser.add_argument('-o', '--output', help="snp tab") 32 parser.add_argument('-o', '--output', help="snp tab")
32 parser.add_argument('-s', '--snp_table', help="vcf") 33 parser.add_argument('-s', '--snp_table', help="vcf")
34 parser.add_argument('-p', '--positions', help="positions")
35
33 36
34 37
35 args = parser.parse_args() 38 args = parser.parse_args()
36 output_file = args.output 39 output_file = args.output
37 input_file = args.snp_table 40 input_file = args.snp_table
41 input_file2=args.positions
38 #------------------------------------------------------------------------------------------ 42 #------------------------------------------------------------------------------------------
39 43
40 44
41 #read in file as dataframe 45 #read in file as dataframe
42 df =read_csv(input_file,sep='\t', dtype=object) 46 df =read_csv(input_file,sep='\t', dtype=object)
47 df3=read_csv(input_file2,sep='\t', dtype=object, names=['#CHROM','POS'])
48 #pdb.set_trace()
49 #get the positions that are missing
50 if df.index.size!= df3.index.size:
51 df4=df3[~df3.POS.isin(df.POS)]
52 #pdb.set_trace()
53 df4=df4.set_index('POS')
54 df=df.set_index('POS')
55 df2=concat([df,df4], axis=1)
56 #if there is no information for a loci the position is dropped so it fills the nan with N
57 df5=df2.iloc[:, 2:4].fillna('N')
58 df5=df5.reindex(df3.POS)
59 else:
60 df5=df.iloc[:,3:5]
43 61
44 #------------------------------------------------------------------------------------------ 62 #------------------------------------------------------------------------------------------
45 63
46 # only columns with qbase and refbase in table 64
47 #count_qbase=list(df.columns.values)
48 #qindexes=[]
49 #for i, v in enumerate(count_qbase):
50 # if 'ALT' in v:
51 # qindexes.append(i)
52 df2=df.iloc[:,3:5]
53 #pdb.set_trace() 65 #pdb.set_trace()
54 66
55 #------------------------------------------------------------------------------------------ 67 #------------------------------------------------------------------------------------------
68 #replaces the . with the nucleotide call of the reference also deals with multiallelic states calling them N
56 ref_list=[] 69 ref_list=[]
57 for i in range(0,df2.index.size): 70 for i in range(0,df5.index.size):
58 if df2.iloc[i,1]==".": 71 if df5.iloc[i,1]==".":
59 ref_list.append(df2.iloc[i,0][0]) 72 ref_list.append(df5.iloc[i,0][0])
73 elif len(df5.iloc[i,1])>1:
74 ref_list.append('N')
60 else: 75 else:
61 ref_list.append(df2.iloc[i,1][0]) 76 ref_list.append(df5.iloc[i,1][0])
62 #pdb.set_trace() 77 #pdb.set_trace()
63 # 78 #
64 ##------------------------------------------------------------------------------------------ 79 ##------------------------------------------------------------------------------------------
65 # 80 #
66 #save file with output name for fasta -o option and removes header and index 81 #save file with output name for fasta -o option and removes header and index
67 with open(output_file,'w') as output: 82 with open(output_file,'w') as output:
68 output.write(df.columns.values[4] + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)])) 83 output.write('ALT' + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)]))
69 ##------------------------------------------------------------------------------------------ 84 ##------------------------------------------------------------------------------------------