Mercurial > repos > saskia-hiltemann > virtual_normal_analysis
comparison vcf2lv.sh @ 4:58815aed4ec3 draft default tip
few bugfixes in VCF-2-variantlist
| author | saskia-hiltemann |
|---|---|
| date | Wed, 04 Nov 2015 05:06:12 -0500 |
| parents | 885ba15c2564 |
| children |
comparison
equal
deleted
inserted
replaced
| 3:ac09a5aaed0b | 4:58815aed4ec3 |
|---|---|
| 17 FS="\t"; | 17 FS="\t"; |
| 18 OFS="\t"; | 18 OFS="\t"; |
| 19 count=0; | 19 count=0; |
| 20 | 20 |
| 21 #output new header | 21 #output new header |
| 22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq" | 22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef" > "headerline.txt" |
| 23 }{ | 23 }{ |
| 24 | 24 |
| 25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column) | 25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column) |
| 26 | 26 |
| 27 # detect multivariants | 27 # detect multivariants |
| 28 chrom=$1 | 28 chrom=$1 |
| 29 pos=$2 | 29 pos=$2 |
| 30 ref=$4 | 30 ref=$4 |
| 31 #alt=$5 | 31 #alt=$5 |
| 32 reflen=length($4) | 32 reflen=length($4) |
| 33 | |
| 34 # excel adds quotes sometimes :s | |
| 35 gsub(/"/,"",ref) | |
| 36 gsub(/"/,"",alt) | |
| 33 | 37 |
| 34 # add chr prefix if needed | 38 # add chr prefix if needed |
| 35 if(substr($1,1,3)!="chr") | 39 if(substr($1,1,3)!="chr") |
| 36 chromosome="chr"$1 | 40 chromosome="chr"$1 |
| 37 else | 41 else |
| 45 | 49 |
| 46 | 50 |
| 47 # determine varType | 51 # determine varType |
| 48 if(length(ref) == 1 && length(alt) == 1) | 52 if(length(ref) == 1 && length(alt) == 1) |
| 49 varType="snp" | 53 varType="snp" |
| 50 else if (length(ref) == 1 ) | 54 else if (length(ref) == 1 && substr(ref,1,1)==substr(alt,1,1) ) |
| 51 varType="ins" | 55 varType="ins" |
| 52 else if (length(alt) == 1 ) | 56 else if (length(alt) == 1 && substr(ref,1,1)==substr(alt,1,1) ) |
| 53 varType="del" | 57 varType="del" |
| 54 else | 58 else |
| 55 varType="sub" | 59 varType="sub" |
| 56 | 60 |
| 57 # determine start and end coordinates in 0-based half-open coordinate system | 61 # determine start and end coordinates in 0-based half-open coordinate system |
| 60 start=pos-1 | 64 start=pos-1 |
| 61 end=pos | 65 end=pos |
| 62 } | 66 } |
| 63 else if (varType=="ins"){ | 67 else if (varType=="ins"){ |
| 64 start=pos | 68 start=pos |
| 65 end=pos | 69 end=pos |
| 66 } | 70 } |
| 67 else if (varType=="del"){ | 71 else if (varType=="del"){ |
| 68 start=pos | 72 start=pos |
| 69 end=pos+(reflen-1) | 73 end=pos+(reflen-1) |
| 70 } | 74 } |
| 71 else if (varType=="sub"){ | 75 else if (varType=="sub"){ |
| 72 start=pos | 76 start=pos-1 |
| 73 end=pos+(reflen-1) | 77 end=pos+(reflen-1) |
| 74 } | 78 } |
| 75 | 79 |
| 76 # remove leading reference base | 80 # remove leading reference base |
| 77 if (varType!="snp" && substr(ref,1,1)==substr(alt,1,1)){ #subs not mandatory leading reference base :s | 81 if ( varType!="snp" && substr(ref,1,1)==substr(alt,1,1) ){ #subs not mandatory leading reference base :s |
| 78 reference=substr(ref,2) | 82 reference=substr(ref,2) |
| 79 alleleSeq=substr(alt,2) | 83 alleleSeq=substr(alt,2) |
| 84 if (varType =="sub"){ | |
| 85 start+=1 | |
| 86 } | |
| 80 } | 87 } |
| 81 else{ | 88 else{ |
| 82 reference=ref | 89 reference=ref |
| 83 alleleSeq=alt | 90 alleleSeq=alt |
| 84 } | 91 } |
| 85 | 92 |
| 86 #print output variant(s) | 93 #print output variant(s) |
| 87 | 94 |
| 88 print count, chromosome, start, end, varType, reference, alleleSeq | 95 if(chromosome == "chr1" || chromosome == "chr2" || chromosome == "chr3" || chromosome == "chr4" || chromosome == "chr5" || chromosome == "chr6" || chromosome == "chr7" || chromosome == "chr8" || chromosome == "chr9" || chromosome == "chr10" || chromosome == "chr11" || chromosome == "chr12" || chromosome == "chr13" || chromosome == "chr14" || chromosome == "chr15" || chromosome == "chr16" || chromosome == "chr17" ||chromosome == "chr18" ||chromosome == "chr19" ||chromosome == "chr20" ||chromosome == "chr21" ||chromosome == "chr22" ||chromosome == "chrX" ||chromosome == "chrY" ) |
| 96 print count, chromosome, start, end, varType, reference, alleleSeq, "" | |
| 89 | 97 |
| 90 count+=1 | 98 count+=1 |
| 91 } | 99 } |
| 92 } | 100 } |
| 93 }END{}' $vcffile > $outputfile | 101 }END{}' $vcffile > $outputfile.almost |
| 94 | 102 |
| 95 | 103 |
| 104 | |
| 105 # due to overlapping variants that we reduce to more canonical forms, variants may have become out of order, so re-sort to be sure | |
| 106 sort -k2,2V -k3,3g $outputfile.almost > $outputfile.almost2 | |
| 107 | |
| 108 cat headerline.txt $outputfile.almost2 > $outputfile | |
| 109 | |
| 110 | |
| 111 | |
| 96 | 112 |
| 97 #from 1000 Genomes site: | 113 #from 1000 Genomes site: |
| 98 | 114 |
| 99 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required) | 115 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required) |
| 100 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required) | 116 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required) |
