comparison vcf2lv.sh @ 0:1209f18a5a83 draft

Uploaded
author saskia-hiltemann
date Mon, 03 Aug 2015 05:01:15 -0400
parents
children 885ba15c2564
comparison
equal deleted inserted replaced
-1:000000000000 0:1209f18a5a83
1 #!/bin/bash
2
3 vcffile=$1
4 outputfile=$2
5
6 # vcf columns: CHROM-POS-ID-REF-ALT
7 # LV cloumns: variantId-chromosome-start-end-reference-alleleSeq-xRef
8
9
10 # add chr prefix if not present
11 # determine varType (snp, ins, del, sub)
12 # convert coordinates to 0-based halfopen
13 # calculate end coordinate from position and length
14 # remove leading reference base from the non-SNP variants, update position
15
16 awk 'BEGIN{
17 FS="\t";
18 OFS="\t";
19 count=0;
20
21 #output new header
22 print "variantId", "chromosome", "begin", "end", "varType", "reference", "alleleSeq", "xRef"
23 }{
24
25 if(substr($0,1,1)!="#" && $5 != "."){ #skip header or nonvariant entries (period in ALT column)
26
27 # detect multivariants
28 chrom=$1
29 pos=$2
30 ref=$4
31 #alt=$5
32 reflen=length($4)
33
34 # add chr prefix if needed
35 if(substr($1,1,3)!="chr")
36 chromosome="chr"$1
37 else
38 chromosome=chrom
39
40 # split ALT column in case of multiple variant alleles
41 split($5,alleles,",");
42
43 for (i in alleles) {
44 alt=alleles[i]
45
46
47 # determine varType
48 if(length(ref) == 1 && length(alt) == 1)
49 varType="snp"
50 else if (length(ref) == 1 )
51 varType="ins"
52 else if (length(alt) == 1 )
53 varType="del"
54 else
55 varType="sub"
56
57 # determine start and end coordinates in 0-based half-open coordinate system
58
59 if (varType=="snp"){
60 start=pos-1
61 end=pos
62 }
63 else if (varType=="ins"){
64 start=pos
65 end=pos
66 }
67 else if (varType=="del"){
68 start=pos
69 end=pos+(reflen-1)
70 }
71 else if (varType=="sub"){
72 start=pos
73 end=pos+(reflen-1)
74 }
75
76 # remove leading reference base
77 if (varType!="snp" && substr(ref,1,1)==substr(alt,1,1)){ #subs not mandatory leading reference base :s
78 reference=substr(ref,2)
79 alleleSeq=substr(alt,2)
80 }
81 else{
82 reference=ref
83 alleleSeq=alt
84 }
85
86 #print output variant(s)
87
88 if(chromosome != "chrM")
89 print count, chromosome, start, end, varType, reference, alleleSeq, ""
90
91 count+=1
92 }
93 }
94 }END{}' $vcffile > $outputfile
95
96
97
98 #from 100Genomes site:
99
100 #CHROM chromosome: an identifier from the reference genome. All entries for a specific CHROM should form a contiguous block within the VCF file.(Alphanumeric String, Required)
101 #POS position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. (Integer, Required)
102 #ID semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (Alphanumeric String)
103 #REF reference base(s): Each base must be one of A,C,G,T,N. Bases should be in uppercase. Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For InDels, the reference String must include the base before the event (which must be reflected in the POS field). (String, Required).
104 #ALT comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N, or an angle-bracketed ID String (”<ID>”). If there are no alternative alleles, then the missing value should be used. Bases should be in uppercase. (Alphanumeric String; no whitespace, commas, or angle-brackets are permitted in the ID String itself)