Mercurial > repos > brigidar > vcf_to_snp
comparison vcf_snp.py @ 6:3a352cb57117 draft
new script updates
author | brigidar |
---|---|
date | Fri, 06 Nov 2015 15:12:17 -0500 |
parents | 866cd9ce1cbd |
children | e9dcf64ef8ec |
comparison
equal
deleted
inserted
replaced
5:d5725351bf22 | 6:3a352cb57117 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 ##!/usr/local/anaconda/bin/python | |
2 | 3 |
3 ######################################################################################### | 4 ######################################################################################### |
4 # # | 5 # # |
5 # Name : vcf_snp.py # | 6 # Name : vcf_snp.py # |
6 # Version : 0.2 # | 7 # Version : 0.3 # |
7 # Project : extract snp from vcf # | 8 # Project : extract snp from vcf # |
8 # Description : Script to exctract snps # | 9 # Description : Script to exctract snps # |
9 # Author : Brigida Rusconi # | 10 # Author : Brigida Rusconi # |
10 # Date : November 02 2015 # | 11 # Date : November 06 2015 # |
11 # # | 12 # # |
12 ######################################################################################### | 13 ######################################################################################### |
13 | 14 |
14 # If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column. | 15 # If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column. |
15 | 16 |
28 #output and input file name to give with the script | 29 #output and input file name to give with the script |
29 parser = argparse.ArgumentParser() | 30 parser = argparse.ArgumentParser() |
30 | 31 |
31 parser.add_argument('-o', '--output', help="snp tab") | 32 parser.add_argument('-o', '--output', help="snp tab") |
32 parser.add_argument('-s', '--snp_table', help="vcf") | 33 parser.add_argument('-s', '--snp_table', help="vcf") |
34 parser.add_argument('-p', '--positions', help="positions") | |
35 | |
33 | 36 |
34 | 37 |
35 args = parser.parse_args() | 38 args = parser.parse_args() |
36 output_file = args.output | 39 output_file = args.output |
37 input_file = args.snp_table | 40 input_file = args.snp_table |
41 input_file2=args.positions | |
38 #------------------------------------------------------------------------------------------ | 42 #------------------------------------------------------------------------------------------ |
39 | 43 |
40 | 44 |
41 #read in file as dataframe | 45 #read in file as dataframe |
42 df =read_csv(input_file,sep='\t', dtype=object) | 46 df =read_csv(input_file,sep='\t', dtype=object) |
47 df3=read_csv(input_file2,sep='\t', dtype=object, names=['#CHROM','POS']) | |
48 #pdb.set_trace() | |
49 #get the positions that are missing | |
50 if df.index.size!= df3.index.size: | |
51 df4=df3[~df3.POS.isin(df.POS)] | |
52 #pdb.set_trace() | |
53 df4=df4.set_index('POS') | |
54 df=df.set_index('POS') | |
55 df2=concat([df,df4], axis=1) | |
56 #if there is no information for a loci the position is dropped so it fills the nan with N | |
57 df5=df2.iloc[:, 2:4].fillna('N') | |
58 df5=df5.reindex(df3.POS) | |
59 else: | |
60 df5=df.iloc[:,3:5] | |
43 | 61 |
44 #------------------------------------------------------------------------------------------ | 62 #------------------------------------------------------------------------------------------ |
45 | 63 |
46 # only columns with qbase and refbase in table | 64 |
47 #count_qbase=list(df.columns.values) | |
48 #qindexes=[] | |
49 #for i, v in enumerate(count_qbase): | |
50 # if 'ALT' in v: | |
51 # qindexes.append(i) | |
52 df2=df.iloc[:,3:5] | |
53 #pdb.set_trace() | 65 #pdb.set_trace() |
54 | 66 |
55 #------------------------------------------------------------------------------------------ | 67 #------------------------------------------------------------------------------------------ |
68 #replaces the . with the nucleotide call of the reference also deals with multiallelic states calling them N | |
56 ref_list=[] | 69 ref_list=[] |
57 for i in range(0,df2.index.size): | 70 for i in range(0,df5.index.size): |
58 if df2.iloc[i,1]==".": | 71 if df5.iloc[i,1]==".": |
59 ref_list.append(df2.iloc[i,0][0]) | 72 ref_list.append(df5.iloc[i,0][0]) |
73 elif len(df5.iloc[i,1])>1: | |
74 ref_list.append('N') | |
60 else: | 75 else: |
61 ref_list.append(df2.iloc[i,1][0]) | 76 ref_list.append(df5.iloc[i,1][0]) |
62 #pdb.set_trace() | 77 #pdb.set_trace() |
63 # | 78 # |
64 ##------------------------------------------------------------------------------------------ | 79 ##------------------------------------------------------------------------------------------ |
65 # | 80 # |
66 #save file with output name for fasta -o option and removes header and index | 81 #save file with output name for fasta -o option and removes header and index |
67 with open(output_file,'w') as output: | 82 with open(output_file,'w') as output: |
68 output.write(df.columns.values[4] + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)])) | 83 output.write('ALT' + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)])) |
69 ##------------------------------------------------------------------------------------------ | 84 ##------------------------------------------------------------------------------------------ |