# HG changeset patch # User saskia-hiltemann # Date 1446631572 18000 # Node ID 58815aed4ec300e1115bdfab47257b1a301dd08f # Parent ac09a5aaed0ba498d87c0facd80f03f31e3d9005 few bugfixes in VCF-2-variantlist diff -r ac09a5aaed0b -r 58815aed4ec3 vcf2lv.sh --- a/vcf2lv.sh Mon Aug 03 06:00:51 2015 -0400 +++ b/vcf2lv.sh Wed Nov 04 05:06:12 2015 -0500 @@ -19,7 +19,7 @@ count=0; #output new header - print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq" + print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef" > "headerline.txt" }{ if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column) @@ -31,6 +31,10 @@ #alt=$5 reflen=length($4) + # excel adds quotes sometimes :s + gsub(/"/,"",ref) + gsub(/"/,"",alt) + # add chr prefix if needed if(substr($1,1,3)!="chr") chromosome="chr"$1 @@ -47,9 +51,9 @@ # determine varType if(length(ref) == 1 && length(alt) == 1) varType="snp" - else if (length(ref) == 1 ) + else if (length(ref) == 1 && substr(ref,1,1)==substr(alt,1,1) ) varType="ins" - else if (length(alt) == 1 ) + else if (length(alt) == 1 && substr(ref,1,1)==substr(alt,1,1) ) varType="del" else varType="sub" @@ -62,21 +66,24 @@ } else if (varType=="ins"){ start=pos - end=pos + end=pos } else if (varType=="del"){ start=pos end=pos+(reflen-1) } else if (varType=="sub"){ - start=pos + start=pos-1 end=pos+(reflen-1) } # remove leading reference base - if (varType!="snp" && substr(ref,1,1)==substr(alt,1,1)){ #subs not mandatory leading reference base :s + if ( varType!="snp" && substr(ref,1,1)==substr(alt,1,1) ){ #subs not mandatory leading reference base :s reference=substr(ref,2) alleleSeq=substr(alt,2) + if (varType =="sub"){ + start+=1 + } } else{ reference=ref @@ -85,14 +92,23 @@ #print output variant(s) - print count, chromosome, start, end, varType, reference, alleleSeq + if(chromosome == "chr1" || chromosome == "chr2" || chromosome == "chr3" || chromosome == "chr4" || chromosome == "chr5" || chromosome == "chr6" || chromosome == "chr7" || chromosome == "chr8" || chromosome == "chr9" || chromosome == "chr10" || chromosome == "chr11" || chromosome == "chr12" || chromosome == "chr13" || chromosome == "chr14" || chromosome == "chr15" || chromosome == "chr16" || chromosome == "chr17" ||chromosome == "chr18" ||chromosome == "chr19" ||chromosome == "chr20" ||chromosome == "chr21" ||chromosome == "chr22" ||chromosome == "chrX" ||chromosome == "chrY" ) + print count, chromosome, start, end, varType, reference, alleleSeq, "" count+=1 } } - }END{}' $vcffile > $outputfile + }END{}' $vcffile > $outputfile.almost + +# due to overlapping variants that we reduce to more canonical forms, variants may have become out of order, so resort to be sure +sort -k2,2V -k3,3g $outputfile.almost > $outputfile.almost2 + +cat headerline.txt $outputfile.almost2 > $outputfile + + + #from 100Genomes site: @@ -100,4 +116,4 @@ #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required) #ID semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (Alphanumeric String) #REF reference base(s): Each base must be one of A,C,G,T,N. Bases should be in uppercase. Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For InDels, the reference String must include the base before the event (which must be reflected in the POS field). (String, Required). -#ALT comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N, or an angle-bracketed ID String (””). If there are no alternative alleles, then the missing value should be used. Bases should be in uppercase. (Alphanumeric String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) +#ALT comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N, or an angle-bracketed ID String (””). If there are no alternative alleles, then the missing value should be used. Bases should be in uppercase. (Alphanumeric String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) \ No newline at end of file