Repository 'vcf_to_snp'
hg clone https://toolshed.g2.bx.psu.edu/repos/brigidar/vcf_to_snp

Changeset 2:ea2f686dfd4a (2015-11-02)
Previous changeset 1:032d2c8cf8ae (2015-11-02) Next changeset 3:a928631d8297 (2015-11-02)
Commit message:
Uploaded
modified:
vcf_snp.py
b
diff -r 032d2c8cf8ae -r ea2f686dfd4a vcf_snp.py
--- a/vcf_snp.py Mon Nov 02 12:45:32 2015 -0500
+++ b/vcf_snp.py Mon Nov 02 19:30:43 2015 -0500
[
@@ -3,25 +3,15 @@
 #########################################################################################
 # #
 # Name       : vcf_snp.py #
-# Version     : 0.1 #
+# Version     : 0.2 #
 # Project     : extract snp from vcf #
 # Description : Script to exctract snps #
 # Author      : Brigida Rusconi #
-# Date        : October 30 2015 #
+# Date        : November 02 2015 #
 # #
 #########################################################################################
-#for replacement of a given value with NaN
-#http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
 
-# to remove any symbol and replace it with nan
-#http://stackoverflow.com/questions/875968/how-to-remove-symbols-from-a-string-with-python
-
-# for isin information
-#http://pandas.pydata.org/pandas-docs/stable/indexing.html
-
-# for selecting rows that have an indel:
-#http://stackoverflow.com/questions/14247586/python-pandas-how-to-select-rows-with-one-or-more-nulls-from-a-dataframe-without
-
+# If the position is identical to the reference it does not print the nucleotide. I have to retrieve it from the ref column.
 
 
 #------------------------------------------------------------------------------------------
@@ -53,30 +43,30 @@
 
 #read in file as dataframe
 df =read_csv(input_file,sep='\t', dtype=object)
-df=df.set_index(['#CHROM','POS'])
 
-#need to fill na otherwise it cannot do boolean operations
-df=df.fillna('--')
-print "vcf " + str(df.index.size)
 #------------------------------------------------------------------------------------------
 
 # only columns with qbase and refbase in table
-count_qbase=list(df.columns.values)
-qindexes=[]
-for i, v in enumerate(count_qbase):
-    if 'ALT' in v:
-        qindexes.append(i)
-df2=df.iloc[:,qindexes]
-
-#------------------------------------------------------------------------------------------
+#count_qbase=list(df.columns.values)
+#qindexes=[]
+#for i, v in enumerate(count_qbase):
+#    if 'ALT' in v:
+#        qindexes.append(i)
+df2=df.iloc[:,3:5]
 #pdb.set_trace()
 
 #------------------------------------------------------------------------------------------
-
+ref_list=[]
+for i in range(0,df2.index.size):
+    if df2.iloc[i,1]==".":
+        ref_list.append(df2.iloc[i,0][0])
+    else:
+        ref_list.append(df2.iloc[i,1][0])
+#pdb.set_trace()
+#
+##------------------------------------------------------------------------------------------
+#
 #save file with output name for fasta -o option and removes header and index
 with open(output_file,'w') as output:
-    df2.T.to_csv(output, sep='\t',header=False)
-#------------------------------------------------------------------------------------------
-
-
-
+    output.write(df.columns.values[4] + '\t' + ''.join([str(i) for v,i in enumerate(ref_list)]))
+##------------------------------------------------------------------------------------------
\ No newline at end of file