comparison gfapts/inc/annovar/convert2annovar.pl @ 0:f753b30013e6 draft

Uploaded
author rdaveau
date Fri, 29 Jun 2012 10:20:55 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f753b30013e6
1 #!/usr/bin/perl
2 use warnings;
3 use strict;
4 use Getopt::Long;
5 use Pod::Usage;
6
7 our $VERSION = '$Revision: 466 $';
8 our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
9
10 our ($verbose, $help, $man);
11 our ($variantfile);
12 our ($outfile, $format, $includeinfo, $snpqual, $snppvalue, $coverage, $maxcoverage, $chr, $chrmt, $altcov, $fraction, $species, $filterword, $confraction, $allallele);
13
14 our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-');
15
16
17 GetOptions('verbose'=>\$verbose, 'help|h'=>\$help, 'man'=>\$man, 'outfile=s'=>\$outfile, 'format=s'=>\$format, 'includeinfo'=>\$includeinfo,
18 'snpqual=f'=>\$snpqual, 'snppvalue=f'=>\$snppvalue, 'coverage=i'=>\$coverage, 'maxcoverage=i'=>\$maxcoverage, 'chr=s'=>\$chr, 'chrmt=s'=>\$chrmt,
19 'fraction=f'=>\$fraction, 'altcov=i'=>\$altcov,
20 'species'=>\$species, 'filter=s'=>\$filterword, 'confraction=f'=>\$confraction, 'allallele!'=>\$allallele) or pod2usage ();
21
22 $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
23 $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
24 @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
25 @ARGV == 1 or pod2usage ("Syntax error");
26
27 ($variantfile) = @ARGV;
28
29 $chrmt ||= 'M';
30
31 if (not $format) {
32 $format = 'pileup';
33 print STDERR "NOTICE: the default --format argument is set as 'pileup'\n";
34 }
35
36 if (defined $outfile) {
37 open (STDOUT, ">$outfile") or die "Error: cannot write to output file $outfile: $!\n";
38 }
39
40 defined $snpqual and $format eq 'pileup' || $format eq 'vcf4' || pod2usage ("Error in argument: the --snpqual is supported only for the 'pileup' or 'vcf4' format");
41 defined $snppvalue and $format eq 'gff3-solid' || pod2usage ("Error in argument: the --snppvalue is supported only for the 'gff3-solid' format");
42 if (not defined $snpqual and $format eq 'pileup') {
43 $snpqual = 20;
44 print STDERR "NOTICE: the default --snpqual argument for pileup format is set as 20\n";
45 }
46
47 if (not defined $snppvalue) {
48 $snppvalue = 1; #by default, do not use any of the P-value cutoff in filtering out GFF3-SOLID files (this is differnt from handling pileup files)
49 }
50
51 if (not defined $coverage) {
52 $coverage = 0;
53 }
54
55 if (defined $fraction) {
56 $format eq 'pileup' or $format eq 'vcf4' or pod2usage ("Error in argument: the '--fraction' argument is supported for the pileup or vcf4 format only");
57 $format eq 'vcf4' and print STDERR "NOTICE: the --fraction argument works ONLY on indels for vcf4 format\n";
58 $fraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --fraction argument must be between 0 and 1 inclusive");
59 } else {
60 $fraction = 0;
61 }
62
63 if (defined $confraction) {
64 $format eq 'vcf4' and print STDERR "NOTICE: the --confraction argument works ONLY on indels for vcf4 format\n";
65 $confraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --confraction argument must be between 0 and 1 inclusive");
66 } else {
67 $confraction = 0;
68 }
69
70 if (defined $altcov) {
71 $format eq 'pileup' or pod2usage ("Error in argument: the '--altcov' argument is supported for the '--format pileup' only");
72 $altcov < $coverage or pod2suage ("Error in argument: the --altcov argument must be less than --coverage");
73 $altcov > 0 or pod2suage ("Error in argument: the --altcov argument must be a positive integer");
74 }
75
76 if (defined $species) {
77 $format eq 'gff3-solid' or pod2usage ("Error in argument: the '--species' argument is only necessary for the '--format gff3-solid'");
78 }
79
80 if ($allallele) {
81 $format eq 'vcf4' or pod2usage ("Error in argument: the '--allallele' argument is only supported for the '--format vcf4'");
82 }
83
84 if ($format eq 'pileup') {
85 convertPileup ($variantfile);
86 } elsif ($format eq 'cg') {
87 convertCG ($variantfile);
88 } elsif ($format eq 'gff3-solid') {
89 convertGFF3SolidSNP ($variantfile);
90 } elsif ($format eq 'soap') {
91 print STDERR "WARNING: the support for '--format soap' is not well developed yet and may contain bugs for indel analysis.\n";
92 convertSOAP ($variantfile);
93 } elsif ($format eq 'maq') {
94 print STDERR "WARNING: the support for '--format maq' is not well developed yet and may contain bugs.\n";
95 convertMAQSNP ($variantfile);
96 } elsif ($format eq 'casava') {
97 if (not defined $chr) {
98 pod2usage ("Error in argument: please supply --chr argument for the '--format casava'");
99 }
100 convertCASAVA ($variantfile, $chr);
101 } elsif ($format eq 'vcf4') {
102 convertVCF4 ($variantfile);
103 } elsif ($format eq 'annovar') {
104 convertANNOVAR ($variantfile);
105 } else {
106 pod2usage ("Error in argument: the --format $format is not currently supported. Please contact ANNOVAR developer for adding the support");
107 }
108
109
110 sub convertPileup {
111 my ($variantfile) = @_;
112 my ($countline, $countvar, $counthom, $counthet, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0/;
113
114 if ($variantfile eq 'stdin') {
115 *VAR = *STDIN;
116 } else {
117 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
118 }
119 print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, SNP quality, total reads, reads with mutation\n";
120
121 while (<VAR>) {
122 s/[\r\n]+$//;
123 $countline++;
124 my $hom = 'hom';
125 my @field = split (/\t/, $_);
126 @field >= 10 or die "Error: invalid record found in pileupfile $variantfile (at least 10 fields expected): <$_>\n";
127 my ($chr, $pos, $wt, $call, @other) = @field;
128 my ($cons_qual, $snp_quality, $readcount, $readallele) = @field[4,5,7,8];
129 $chr =~ s/^chr//;
130 $wt = uc $wt; #wt may or may not in upper case, it depends on the input FASTA file
131 $call = uc $call; #usually call is in upper case
132 $readallele = uc $readallele; #lower case indicate the opposite strand
133
134 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
135
136 $snp_quality >= $snpqual or next; #quality of the variant call
137 $readcount >= $coverage or next; #read count of the variant call
138 $maxcoverage and $readcount <= $maxcoverage || next; #maximum coverage of the variant call
139
140 if ($wt eq '*') { #indel
141 #example:
142 #1 970271 * +C/+C 39 106 44 5 +C * 1 4 0 0 0
143 #1 1548977 * */+CCG 29 29 42 3 * +CCG 2 1 0 0 0
144 #1 1674810 * */+C 24 24 42 6 * +C 5 1 0 0 0
145 #1 968466 * -CT/-CT 53 339 55 5 -CT * 5 0 0 0 0
146 #1 1093600 * -GAAA/* 29 29 53 3 -GAAA * 1 2 0 0 0
147 #1 1110101 * */-A 41 41 17 6 * -A 5 1 0 0 0
148 #1 1215395 * */-TC 26 26 32 4 * -TC 3 1 0 0 0
149 my @obs = split (/\//, $call); #example: '+AG/+AG' as homozygotes, '*/-TA' or '*/+T' as heterozygotes
150 @obs == 2 or die "Error: pileup record contains more than two alternative alleles: <$_>\n";
151 my ($end, $ref, $alt);
152 my ($indelreadcount); #number of reads supporting the indel
153
154
155 if ($obs[0] eq $obs[1]) {
156 #something weird may occur in SamTools output: 22 15231121 * */* 360 0 32 158 * +GA 156 2 0 0 0
157 $obs[0] eq '*' and next;
158
159 #for deletions, SAMTOOLS represent deletion using a location before the first deleted base in the reference sequence coordinate system
160 #for example, a deletion in Samtools output is "1 109266688 * */-CTT 1429 1429 58 43 * -CTT 24 19 0 0 0"
161 #the correct ANNOVAR input (for rs35029887) should be "1 109266689 109266691 CTT - het 1429"
162 #insertions are fine without change; for example, check rs5745466 in Genome Browser; samtools report "1 76119508 * +AT/+AT"
163 #for this insertion, ANNOVAR input file (for rs5745466) becomes "1 76119508 76119508 - AT hom 1601"
164
165 if ($obs[0] =~ m/^\-/) {
166 $pos++; #add 1 to position in deletion
167 }
168
169 $indelreadcount = calculateIndelReadCount ($obs[0], \@field);
170 $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
171 defined $altcov and $indelreadcount >= $altcov || next;
172
173 ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]);
174 print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
175 $counthom++;
176 } else {
177 $hom = 'het';
178 if ($obs[0] =~ m/^[\-\+]/) {
179 $obs[0] =~ m/^\-/ and $pos++;
180 ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]);
181 $indelreadcount = calculateIndelReadCount ($obs[0], \@field);
182 $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
183 defined $altcov and $indelreadcount >= $altcov || next;
184
185 print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
186 $counthet++;
187 }
188 if ($obs[1] =~ m/^[\-\+]/) {
189 $obs[1] =~ m/^\-/ and $pos++;
190 ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[1]);
191 $indelreadcount = calculateIndelReadCount ($obs[1], \@field);
192 $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
193 defined $altcov and $indelreadcount >= $altcov || next;
194
195 print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
196 $counthet++;
197 }
198 }
199 $countindel++;
200 } else {
201 #1 798494 G A 36 36 58 3 AAA bbb
202 #1 798677 T K 33 33 52 26 ,$.,,G.GG,.,......,..G,,... b`bbBaJIbFbZWaTNQbb_VZcbbb
203 #1 856182 G A 42 42 50 5 AAAAA B\bbb
204 #1 861034 A M 48 48 49 14 ,$,.,..,cc.c.,c bbBbb`]BFbHbBB
205 #1 864289 T K 22 22 56 6 .g,,g, BH^_BB
206
207 $wt eq $call and next; #this is not a SNP
208 my $obs = $iupac{$call} or die "Error: invalid best call ($call) in <$_>\n";
209 my @obs = split (//, $obs);
210 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
211 if ($obs[0] ne $obs[1]) {
212 $hom = 'het';
213 }
214
215
216 if ($obs[0] eq $wt) { #obs[0] is guaranteed to be an alternative allele
217 @obs = @obs[1,0];
218 }
219 if ($wt eq 'A' and $obs[0] eq 'G' or $wt eq 'G' and $obs[0] eq 'A' or $wt eq 'C' and $obs[0] eq 'T' or $wt eq 'T' and $obs[0] eq 'C') {
220 unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) {
221 $countti++;
222 }
223
224 } else {
225 unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) {
226 $counttv++;
227 }
228 }
229
230 my $mutallelecount;
231
232 if ($obs[1] eq $wt) { #het SNP
233 if ($chr eq $chrmt) {
234 $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
235 }
236 $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
237 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
238 defined $altcov and $mutallelecount >= $altcov || next;
239
240 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
241 $counthet++;
242 } elsif ($obs[1] ne $obs[0]) { #het SNP but both differ from reference allele
243 if ($chr eq $chrmt) {
244 $hom = calculateAllelicFraction ($obs[1], $field[8], $readcount);
245 }
246 $mutallelecount = calculateMutAlleleCount ($obs[1], $readallele);
247 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
248 defined $altcov and $mutallelecount >= $altcov || next;
249
250 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[1], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
251 if ($chr eq $chrmt) {
252 $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
253 }
254 $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
255 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
256 defined $altcov and $mutallelecount >= $altcov || next;
257
258 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
259 $counthet++;
260 $counthet++;
261 } else { #homo SNP
262 if ($chr eq $chrmt) {
263 $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
264 }
265 $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
266 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
267 defined $altcov and $mutallelecount >= $altcov || next;
268
269 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
270 $counthom++;
271 }
272 $countsnp++;
273 }
274 $countvar++;
275 }
276 my $triallelic = $countsnp-$countti-$counttv;
277 print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
278 print STDERR "NOTICE: Among ${\($counthet+$counthom)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
279 print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
280 }
281
282 sub calculateIndelReadCount {
283 my ($obs, $field) = @_;
284 #make sure to use upper case in the comparison, for example:
285 #chr10 130133 * */-ca 189 189 59 31 * -ca 27 4 0 0 0
286 if ($obs eq uc $field->[8]) {
287 return $field->[10];
288 } elsif ($obs eq uc $field->[9]) {
289 return $field->[11];
290 } else {
291 die "Error: invalid record in pileup file (indel counts cannot be inferred): <$obs> vs <@$field>\n";
292 }
293 }
294
295 sub calculateMutAlleleCount {
296 my ($allele, $string) = @_; #they should be already both in upper case
297 $string =~ s/\^.//g; #^ is followed by mapping quality
298 $string =~ s/\$//g;
299 $string =~ s/[+-]1[^\d]//g; #1 followed by a non-number
300 $string =~ s/[+-]2..//g;
301 $string =~ s/[+-]3...//g;
302 $string =~ s/[+-]4....//g;
303 $string =~ s/[+-]5.....//g;
304 $string =~ s/[+-]6......//g;
305 $string =~ s/[+-]7.......//g;
306 $string =~ s/[+-]8........//g;
307 $string =~ s/[+-]9.........//g;
308 $string =~ s/[+-]10..........//g;
309
310 #make sure to use upper case letter
311 my @string = split (//, uc $string);
312 my $count = 0;
313 for my $i (0 .. @string-1) {
314 $allele eq $string[$i] and $count++;
315 }
316 return $count;
317 }
318
319 sub calculateAllelicFraction {
320 my ($obs, $readbase, $readcount) = @_;
321 my @readbase = split (//, $readbase);
322 my $count=0;
323 for my $i (0 .. @readbase-1) {
324 uc $obs eq uc $readbase[$i] and $count++;
325 }
326 my $hom = $count/$readcount;
327 length ($hom) > 5 and $hom > 0.001 and $hom = sprintf ("%.3f", $hom);
328 return $hom;
329 }
330
331 sub recalculateEndRefObs { #recalculate end position, reference allele and observed allele
332 my ($end, $ref, $obs) = @_;
333 if ($obs =~ m/^\-(\w+)/) { #deletion
334 $end += (length ($1)-1);
335 $ref = $1;
336 $obs = '-';
337 } elsif ($obs =~ m/^\+(\w+)/) { #insertion
338 $ref = '-';
339 $obs = $1;
340 } else {
341 die "Error: cannot handle $end, $ref, $obs\n";
342 }
343 return ($end, $ref, $obs);
344 }
345
346 sub convertCG {
347 my ($variantfile) = @_;
348
349 my ($foundheader, $countline, @field);
350 my ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
351 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
352 print STDERR "NOTICE: Converting variants from $variantfile\n";
353 while (<VAR>) {
354 s/[\r\n]+$//;
355 $countline++;
356 if (m/^>locus/) {
357 $foundheader++;
358 }
359 if (not $foundheader) {
360 $countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 lines)\n";
361 next;
362 }
363 my ($locus, $ploidy, $haplo, $chr, $start, $end, $vartype, $ref, $obs, $score, $haplolink, $xref) = split (/\t/, $_);
364 $chr =~ s/^chr//;
365 $vartype eq 'ins' or $start++; #CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476 2 2 chr1 751820 751820 ins T 49 dbsnp:rs59038458")
366 $obs eq '' and $obs = '-';
367 $ref eq '' and $ref = '-';
368
369 if ($vartype =~ m/^snp|ins|del|delins|sub$/) { #in new versions of the files, "sub" is used instead of "delins".
370 #$chr eq 'M' and next; #ignore chrM markers as they are not diploid
371 if ($chr eq $prechr and $start eq $prestart and $end eq $preend and $obs eq $preobs) { #homozygous mutation
372 print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs, "\t", $vartype, "\t", ($score+$prescore)/2, "\t", "hom\t", $xref, "\n";
373 ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
374 } else {
375 if ($prestart and $preend) {
376 print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
377 }
378 ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = ($chr, $start, $end, $vartype, $ref, $obs, $score, $xref);
379 }
380 }
381 }
382 if ($prestart and $preend) {
383 print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
384 }
385 print STDERR "NOTICE: Done with $countline lines\n";
386 }
387
388 sub convertGFF3SolidSNP {
389 my ($variantfile) = @_;
390 my ($countline, $countvar, $countallvar, @other) = (0, 0, 0);
391 my ($unknown_count); #count of variants with 'unknown' variation type
392
393 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
394 $_ = <VAR>;
395 s/[\r\n]+$//;
396 m/^##gff-version\s+3/ or die "Error: invalid first line in GFF3 file ('##gff-version 3' expected): <$_>\n";
397 $_ = <VAR>;
398 s/[\r\n]+$//;
399 m/^##solid-gff-version/ or print STDERR "WARNING: problematic second line in GFF3-SOLiD file ('##solid-gff-version' expected): <$_>\n";
400
401 print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, variant score (P-value), total clipped normal coverage reads, total reads with mutated allele\n";
402
403 while (<VAR>) {
404 s/[\r\n]+$//;
405 $countline++;
406 m/^##/ and next; #header of comment lines
407 m/^#/ and next; #header of results lines
408
409 my @field = split (/\t/, $_);
410 @field == 9 or die "Error: invalid record found in $variantfile (10 fields expected): <$_>\n";
411 my ($chr, $program, $type, $pos, $end, $score, $attribute) = @field[0,1,2,3,4,5,8]; #score is the P-value for the SNP calls
412 $chr eq 'chr_name' and next; #header line
413
414 if ($score ne '.') {
415 $score >=0 and $score <=1 or die "Error: invalid score record found in file (0-1 range expected): <$_>\n";
416 $score <= $snppvalue or next;
417 }
418
419 if ($species and $species eq 'human') {
420 $chr eq '23' and $chr = 'X';
421 $chr eq '24' and $chr = 'Y';
422 $chr eq '25' and $chr = 'M';
423 }
424
425 $includeinfo and @other = ($attribute); #unless -includeinfo is set, the other will not be printed
426
427 my ($readcount, $mutallelecount) = ('.', '.'); #total coverage, coverage for mutated alleles
428
429 if ($type eq 'unknown') {
430 #SOLiD GDD3 may have unknown variation types
431 #chr1 AB_SOLiD Small Indel Tool unknown 3833062 3833153 1 . . ID=5483;len=no_call;allele-call-pos=3833062;allele-call=/CCAC;allele-pos=3833057;alleles=atccatccacccatc/aTCCATCCACCCACCCATC/NO_CALL;allele-counts=REF,2,2;tight_chrom_pos=none;loose_chrom_pos=3833058-3833069;no_nonred_reads=3;coverage_ratio=8.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_50_10_r,L1_1_50_15_r,L1_1_50_15_r,L1_1_50_12_r;bead_ids=1018_196_970,699_1263_465,220_513_923,2022_1532_1071;overall_qvs=4,6,2,50;no_mismatches=5,4,2,0;read_pos=27,29,31,13;from_end_pos=23,21,19,37;strands=+,+,+,+;tags=R3,F3,F3,F3;indel_sizes=-92,-112,4,4;non_indel_no_mismatches=0,0,8,0;unmatched-lengths=50,50,50,50;ave-unmatched=50.0000;anchor-match-lengths=48,49,49,49;ave-anchor-length=48.7500;read_seqs=G23223321322112233223100132013201320110011001322332,T33223321322112233223100132013201320110013021322332,T33223321322112233223100132013201320110011001322332,T31001320132013201100110013223322113030332233113032;base_qvs=;non_indel_seqs=T21322332211221121322332230321212121223322332233221,G12020202202020012001200213022002130012332310122030,G12020202202020012001000210022012110312331331122030,G22111012101031010100002002321020002202121121313021;non_indel_qvs=
432 $unknown_count++;
433 next; #do not count this one!
434 }
435
436 if ($program eq 'SOLiD_diBayes' or $program eq 'AB_SOLiD SNP caller') { #SNP variants
437 #detailed description can be found at http://solidsoftwaretools.com/gf/download/docmanfileversion/208/866/DiBayes_Documentation_v1.2.pdf
438 #chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag=
439 #chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag=
440 #chr1 SOLiD_diBayes SNP 714835 714835 0.041579 . . genotype=R;reference=A;coverage=5;refAlleleCounts=3;refAlleleStarts=3;refAlleleMeanQV=18;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=20;diColor1=02;diColor2=20;het=1;flag=
441
442 $pos == $end or die "Error: invalid record found in GFF3-SOLiD file: start and end discordant: <$_>\n";
443
444 my ($wt, $call);
445
446 if ($attribute =~ m/ref_base=(\w)/) {
447 $wt = $1;
448 } elsif ($attribute =~ m/reference=(\w)/) {
449 $wt = $1;
450 } else {
451 die "Error: invalid record found in GFF3-SOLiD file (ref_base/reference was not found): <$_>\n";
452 }
453
454 if ($attribute =~ m/consen_base=(\w)/) {
455 $call = $1;
456 } elsif ($attribute =~ m/genotype=(\w)/) {
457 $call = $1;
458 } else {
459 die "Error: invalid record found in GFF3-SOLiD file (consen_base was not found): <$_>\n";
460 }
461
462 if ($attribute =~ m/coverage=(\d+)/) {
463 $readcount = $1;
464 $readcount >= $coverage or next; #read count of the variant call
465 $maxcoverage and $readcount <= $maxcoverage || next;
466 }
467 if ($attribute =~ m/novelAlleleCounts=(\d+)/) {
468 $mutallelecount = $1;
469 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
470 defined $altcov and $mutallelecount >= $altcov || next;
471 }
472
473 my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
474 my @obs = split (//, $obs);
475 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
476 if ($obs[0] eq $wt and $obs[1] eq $wt) {
477 die "Error: reference alleles are identical to observed alleles: <$_>\n";
478 } elsif ($obs[0] eq $wt) {
479 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
480 } elsif ($obs[1] eq $wt) {
481 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
482 } elsif ($obs[1] ne $obs[0]) {
483 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
484 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
485 $countallvar++;
486 } else {
487 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
488 }
489 } elsif ($program eq 'AB_CNV_PIPELINE') { #CNV
490 if ($attribute =~ m/copynum=(\d+)/ or $attribute =~ m/copynumber=(\d+)/) {
491 if ($1 < 2) {
492 print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
493 } elsif ($1 > 2) {
494 print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
495 }
496 } else {
497 print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
498 }
499 } elsif ($program eq 'AB_SOLiD Large Indel Tool') { #CNV
500 #http://solidsoftwaretools.com/gf/download/docmanfileversion/182/780/Large_Indel_Documentation_v1.0.0.pdf
501 ## [FIELDS] (1) chromosome (2) version (3) indel type (4) breakpoint start (5) breakpoint end (6) p-value (7) NA (8) NA (9) attributes
502 #chr10 AB_SOLiD Large Indel Tool insertion 151910 151969 2.77548e-11 . . dev=-71;avgDev=-1.63884;zygosity=HOMOZYGOUS;nRef=0;nDev=14;refDev=0;devDev=-1.60924;refVar=0;devVar=0.0159438;beadIds=1750_720_1641,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279,
503 #chr10 AB_SOLiD Large Indel Tool insertion 154109 154729 2.1559e-11 . . dev=-66;avgDev=-1.51253;zygosity=HOMOZYGOUS;nRef=0;nDev=15;refDev=0;devDev=-1.02864;refVar=0;devVar=0.133236;beadIds=1728_1671_1739,1250_231_25,811_783_1090,1035_908_491,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279,
504 my ($call, @call, $zygosity);
505 if ($attribute =~ m#zygosity=HEMIZYGOUS#) {
506 $zygosity = 'het';
507 } elsif ($attribute =~ m#zygosity=HOMOZYGOUS#) {
508 $zygosity = 'hom';
509 } else {
510 $zygosity = 'unk';
511 }
512 if ($type eq 'insertion') {
513 #the true boundary is unknown (start is different from end) so we cannot use "-" to represent reference allele.
514 print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", 0, "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n";
515 } elsif ($type eq 'deletion') {
516 print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n";
517 }
518 } elsif ($program eq 'AB_SOLiD Small Indel Tool') { #small indels
519 #typical simple insertion and deletions
520 #chr1 AB_SOLiD Small Indel Tool deletion 1352612 1352614 1 . . ID=1290;del_len=3;allele-call-pos=1352612;allele-call=cct/;allele-pos=1352610;alleles=cccctccat/cCCCAT;allele-counts=REF,2;tight_chrom_pos=1352612-1352614;loose_chrom_pos=1352612-1352614;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_3_r,L1_1_25_8_r;bead_ids=1470_2000_506,822_1710_1767;overall_qvs=18,19;no_mismatches=3,3;read_pos=6,13;from_end_pos=19,12;strands=-,+;tags=R3,R3;indel_sizes=-3,-3;non_indel_no_mismatches=1,-1;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=24,99;ave-anchor-length=61.5000;read_seqs=G0112310001100003120031200,G0300213000011000132110021;base_qvs=;non_indel_seqs=T2120033002022200220000002,;non_indel_qvs=
521 #chr1 AB_SOLiD Small Indel Tool insertion_site 1311162 1311162 1 . . ID=1249;ins_len=1;allele-call-pos=1311162;allele-call=/G;allele-pos=1311161;alleles=gaggggggg/GAGGGGGGGG/NO_CALL;allele-counts=REF,3,1;tight_chrom_pos=none;loose_chrom_pos=1311160-1311169;no_nonred_reads=3;coverage_ratio=4.6667;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_50_10_r,L1_1_25_2_r,L1_1_25_3_r;bead_ids=850_837_429,1160_878_181,404_1050_1881,1084_64_1343;overall_qvs=20,56,25,25;no_mismatches=3,2,2,1;read_pos=11,22,11,11;from_end_pos=14,28,14,14;strands=+,-,-,-;tags=R3,F3,F3,F3;indel_sizes=1,1,1,1;non_indel_no_mismatches=-1,1,0,1;unmatched-lengths=25,50,25,25;ave-unmatched=31.2500;anchor-match-lengths=99,49,24,24;ave-anchor-length=49.0000;read_seqs=G1020001130221020000000020,T03223323210110021000000022122030100020221222222122,T0102210000000221223301000,T0102210000000221220301000;base_qvs=;non_indel_seqs=,G21202030032202013220021321131212021000122300013132,G1331133120001221220120120,G1331133120001221220120220;non_indel_qvs=
522
523 #sometimes, allele-call is ambiguous that requires a "block substitution" representation (although they were annotated as insertion or deletion by SOLiD, they should be treated as block substitution by ANNOVAR)
524 #sometimes, mutiple allele calls may be present at the same site
525 #chr1 AB_SOLiD Small Indel Tool deletion 1209357 1209360 1 . . ID=1101;del_len=4;allele-call-pos=1209357;allele-call=ggtggg/TT;allele-pos=1209355;alleles=ggggtgggggggtt/gGTTGGGGTT/gGTGTTTTGCCTT/NO_CALL;allele-counts=REF,3,1,1;tight_chrom_pos=none;loose_chrom_pos=1209357-1209363;no_nonred_reads=4;coverage_ratio=3.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=0.9888;run_names=L1_1_25_1_r,L1_1_25_2_r,L1_1_25_4_r,L1_1_25_3_r,L1_1_25_7_r;bead_ids=1017_1024_53,1493_1896_615,1794_647_1473,307_1116_687,773_1492_1671;overall_qvs=24,24,28,24,8;no_mismatches=2,3,2,3,2;read_pos=14,9,14,9,15;from_end_pos=11,16,11,16,10;strands=-,+,-,+,+;tags=F3,R3,F3,F3,F3;indel_sizes=-4,-4,-4,-4,3;non_indel_no_mismatches=1,0,0,0,0;unmatched-lengths=25,25,25,25,25;ave-unmatched=25.0000;anchor-match-lengths=24,24,24,24,24;ave-anchor-length=24.0000;read_seqs=T2221100101000101000221100,G0001122000100000101001020,T2221100101000101000221100,T1112200010100010100112000,T1011220000111000130200001;base_qvs=;non_indel_seqs=G0312033221312111022200300,T0111113210210112100001130,G0312133221312111022200300,G0231003132222112000012020,G3121331033101113122312020;non_indel_qvs=
526 #chr1 AB_SOLiD Small Indel Tool deletion 1209436 1209436 1 . . ID=1103;del_len=1;allele-call-pos=1209436;allele-call=ag/A/G;allele-pos=1209434;alleles=tgagggggtt/tGAGGGGTT/tGGGGGGTT;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1209436-1209441;no_nonred_reads=2;coverage_ratio=5.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_25_2_r;bead_ids=1315_1584_2005,1706_194_437;overall_qvs=28,21;no_mismatches=0,3;read_pos=9,7;from_end_pos=16,18;strands=-,-;tags=R3,R3;indel_sizes=-1,-1;non_indel_no_mismatches=-1,0;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=99,24;ave-anchor-length=61.5000;read_seqs=G3001010000011001010000001,G3010100022110010111000110;base_qvs=;non_indel_seqs=,T1112003220020013202122300;non_indel_qvs=
527 #chr1 AB_SOLiD Small Indel Tool insertion_site 1424540 1424540 1 . . ID=1376;ins_len=3;allele-call-pos=1424540;allele-call=tt/CCCAC;allele-pos=1424537;alleles=ttttttg/TTTCCCACTG/NO_CALL;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1424536-1424543;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_7_r,L1_1_50_16_r;bead_ids=703_437_370,1089_878_1744;overall_qvs=1,9;no_mismatches=3,4;read_pos=5,35;from_end_pos=20,15;strands=-,-;tags=R3,F3;indel_sizes=3,3;non_indel_no_mismatches=2,0;unmatched-lengths=25,50;ave-unmatched=37.5000;anchor-match-lengths=24,47;ave-anchor-length=35.5000;read_seqs=G2032002200200000000000020,T30100102220312202103112130230322210121100200002100;base_qvs=;non_indel_seqs=T2121120003012303000000000,G22213300221101011121030022002222300220322213303102;non_indel_qvs=
528 my ($call, @call, $zygosity);
529 if ($attribute =~ m#experimental-zygosity=HEMIZYGOUS#) {
530 $zygosity = 'het';
531 } elsif ($attribute =~ m#experimental-zygosity=HOMOZYGOUS#) {
532 $zygosity = 'hom';
533 } else {
534 $zygosity = 'unk';
535 }
536 $score = '.'; #by default, score=1 in the output
537
538 #no_nonred_reads: Number of reads with unique start positions (non-redundant reads).
539 #coverage_ratio: Clipped normal coverage/number of non-redundant reads.Clipped coverage is where the parts of the read that are within 5 bp at either end are not counted as a part of coverage.
540 if ($attribute =~ m/no_nonred_reads=(\d+);coverage_ratio=([\d\.]+)/) {
541 $readcount = int ($1*$2);
542 $readcount >= $coverage or next; #clipped normal read count of the variant call (this could even be lower than mut allele count)
543 $maxcoverage and $readcount <= $maxcoverage || next;
544 } else {
545 $readcount = '.';
546 }
547 if ($attribute =~ m/allele-counts=REF,(\d+)/) {
548 $mutallelecount = $1;
549 }
550
551 if ($attribute =~ m#allele\-call=([\w\/]+)#) {
552 @call = split (/\//, $1);
553
554 if (@call == 1) { #a simple deletion
555 print $chr, "\t", $pos, "\t", $end, "\t", $call[0], "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
556 } elsif ($call[0] eq '') { #a simple insertion (single or multiple allele)
557 for my $i (1 .. @call-1) {
558 print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
559 $i > 1 and $countallvar++;
560 }
561 } else { #an indel that may have several alleles, or may require a block substitution representation
562 for my $i (1 .. @call-1) {
563 print $chr, "\t", $pos, "\t", $pos+length($call[0])-1, "\t", $call[0], "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
564 $i > 1 and $countallvar++;
565 }
566 }
567 } else {
568 $call = '0';
569 print $chr, "\t", $pos, "\t", $end, "\t", $call, "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
570 }
571 } else {
572 die "Error: unrecognizable genotype calling program encountered (valid types are SOLiD_diBayes, AB_CNV_PIPELINE, AB_SOLiD Large Indel Tool, AB_SOLiD Small Indel Tool): <$_>\n";
573 }
574
575 $countvar++; #variation positions
576 $countallvar++; #all variants (maybe several at one variation position)
577 }
578 print STDERR "NOTICE: Finished processing $variantfile with $countline input lines\n";
579 print STDERR "NOTICE: Wrote variants in $countvar variation positions ($countallvar variants considering multi-allelic ones)\n";
580 $unknown_count and print STDERR "WARNING: $unknown_count variants with 'unknown' variation type were skipped\n";
581 }
582
583
584
585 sub convertSOAP {
586 my ($variantfile) = @_;
587 my ($countline, $countvar, @other);
588
589 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
590
591 while (<VAR>) {
592 s/[\r\n]+$//;
593 $countline++;
594
595 my @field = split (/\t/, $_);
596 if (@field == 18) { #snp file
597 my ($chr, $pos, $wt, $call, @other) = @field;
598 $chr =~ s/^chr//;
599
600 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
601
602 my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
603 my @obs = split (//, $obs);
604 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
605 if ($obs[0] eq $wt and $obs[1] eq $wt) {
606 die "Error: reference alleles are identical to observed alleles: <$_>\n";
607 } elsif ($obs[0] eq $wt) {
608 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
609 } elsif ($obs[1] eq $wt) {
610 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
611 } elsif ($obs[1] ne $obs[0]) {
612 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
613 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
614 $countvar++;
615 } else {
616 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
617 }
618 } elsif (@field == 6) { #indel file
619 my ($chr, $pos, $strand, $indellen, $call, $homo) = @field;
620 $homo eq 'Homo' and $homo = 'hom';
621 $homo eq 'Hete' and $homo = 'het';
622 $chr =~ s/^chr//;
623
624 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
625
626 if ($indellen =~ m/^\+(\d+)$/) { #insertion
627 length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
628 print join("\t", $chr, $pos, $pos, '-', $call, $homo), "\n";
629 } elsif ($indellen =~ m/^\-(\d+)$/) { #deletion
630 length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
631 print join("\t", $chr, $pos, $pos+$1-1, $call, '-', $homo), "\n";
632 } else {
633 die "Error: invalid record found in SOAPindel file: <$_>\n";
634 }
635 } else {
636 die "Error: invalid record found in $variantfile (18 or 6 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
637 }
638 $countvar++;
639 }
640 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
641 }
642
643 sub convertANNOVAR {
644 my ($variantfile) = @_;
645 my ($countline, $countvar, $countsnp);
646 my ($countti, $counttv) = (0, 0);
647
648 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
649
650 while (<VAR>) {
651 $countline++;
652
653 my @field = split (/\t/, $_);
654 my ($chr, $start, $end, $ref, $obs) = @field;
655 if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) {
656 if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' or $obs eq 'A' or $ref eq 'C' and $obs eq 'T' or $ref eq 'T' and $obs eq 'C') {
657 $countti++;
658 } else {
659 $counttv++;
660 }
661 $countsnp++;
662 }
663
664 print;
665 $countvar++;
666 }
667 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
668 print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions\n";
669 }
670
671 sub convertMAQSNP {
672 my ($variantfile) = @_;
673 my ($countline, $countvar, @other);
674
675 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
676
677 while (<VAR>) {
678 s/[\r\n]+$//;
679 $countline++;
680
681 my @field = split (/\t/, $_);
682 if (@field == 12) { #SNPs
683 my ($chr, $pos, $wt, $call, @other) = @field;
684 $chr =~ s/^chr//;
685
686 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
687
688 my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
689 my @obs = split (//, $obs);
690 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
691 if ($obs[0] eq $wt and $obs[1] eq $wt) {
692 die "Error: reference alleles are identical to observed alleles: <$_>\n";
693 } elsif ($obs[0] eq $wt) {
694 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
695 } elsif ($obs[1] eq $wt) {
696 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
697 } elsif ($obs[1] ne $obs[0]) {
698 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
699 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
700 $countvar++;
701 } else {
702 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
703 }
704 $countvar++;
705 } elsif (@field == 13) { #indels; the deletion start site do not need changes; the duplication start site need additional investigation by ANNOVAR developers
706 my ($chr, $pos, $type, $numread, $call, @other) = @field;
707 $chr =~ s/^chr//;
708
709 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
710
711 my @obs = split (/:/, $call);
712 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
713 if ($obs[0] =~ m/^\-(\d+)/) {
714 my $len = $1;
715 print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[1], "\t", '-', "\t", "het\t", join ("\t", @other), "\n";
716 } elsif ($obs[0] =~ m/^(\d+)/) {
717 my $len = $1;
718 print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
719 }
720 $countvar++;
721 } else {
722 die "Error: invalid record found in $variantfile (12 or 13 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
723 }
724 }
725 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
726 }
727
728 sub convertCASAVA {
729 my ($variantfile, $chr) = @_;
730 my ($countline, $countvar, @other);
731
732 my ($intype);
733 my ($pos_index, $call_index, $reference_index, $type_index, $score_index, $total_index, $used_index);
734 my ($ref_indel_index, $quality_index, $maxgtype_index, $bp1_reads_index, $ref_reads_index, $indel_reads_index, $other_reads_index);
735 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
736
737 while (<VAR>) {
738 s/[\r\n]+$//;
739 $countline++;
740 my @field;
741
742 if (m/^#/) {
743 s/^#//;
744 if (s/^\$\sCOLUMNS\s//) {
745 @field = split (/\s+/, $_);
746 } else {
747 @field = split (/\t/, $_);
748 }
749 if (m/\bposition\b/ or m/\bpos\b/) {
750 for my $i (0 .. @field-1) {
751 if ($field[$i] eq 'position' or $field[$i] eq 'pos') {
752 $pos_index = $i;
753 } elsif ($field[$i] eq 'modified_call') {
754 $intype = 'snp';
755 print STDERR "NOTICE: Automatically detected input type as $intype\n";
756 $call_index = $i;
757 } elsif ($field[$i] eq 'reference') {
758 $reference_index = $i;
759 } elsif ($field[$i] eq 'type') {
760 $type_index = $i;
761 } elsif ($field[$i] eq 'score') {
762 $score_index = $i;
763 } elsif ($field[$i] eq 'total') {
764 $total_index = $i;
765 } elsif ($field[$i] eq 'used') {
766 $used_index = $i;
767 } elsif ($field[$i] eq 'ref/indel') {
768 $intype = 'indel';
769 print STDERR "NOTICE: Automatically detected input type as $intype\n";
770 $ref_indel_index = $i;
771 } elsif ($field[$i] eq 'Q(indel)') {
772 $quality_index = $i;
773 } elsif ($field[$i] eq 'max_gtype') {
774 $maxgtype_index = $i;
775 } elsif ($field[$i] eq 'bp1_reads') {
776 $bp1_reads_index = $i;
777 } elsif ($field[$i] eq 'ref_reads') {
778 $ref_reads_index = $i;
779 } elsif ($field[$i] eq 'indel_reads') {
780 $indel_reads_index = $i;
781 } elsif ($field[$i] eq 'other_reads') {
782 $other_reads_index = $i;
783 }
784 }
785 }
786 next;
787 }
788
789 $intype or die "Error: unable to recognize the correct type of the input file (make sure that header line is present in $variantfile)\n";
790 @field = split (/\t/, $_);
791
792 if ($intype eq 'snp') { #SNPs
793 defined $pos_index and defined $reference_index and defined $call_index or die "Error: unalbe to find the position, reference and modified_call column header in $variantfile\n";
794 my ($pos, $wt, $obs) = @field[$pos_index, $reference_index, $call_index];
795 my (@other);
796 defined $pos and defined $wt and defined $obs or die;
797 if ($includeinfo) {
798 defined $score_index and push @other, $field[$score_index];
799 defined $total_index and push @other, $field[$total_index];
800 defined $used_index and push @other, $field[$used_index];
801 defined $type_index and push @other, $field[$type_index];
802 }
803
804 length ($obs) == 1 and $obs .= $obs;
805 my @obs = split (//, $obs);
806 @obs == 2 or die "Error: observed allele $obs should correspond to two nucleotide alleles: <$_>\n";
807 if ($obs[0] eq $wt and $obs[1] eq $wt) {
808 die "Error: reference alleles are identical to observed alleles: <$_>\n";
809 } elsif ($obs[0] eq $wt) {
810 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
811 } elsif ($obs[1] eq $wt) {
812 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
813 } elsif ($obs[1] ne $obs[0]) {
814 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
815 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
816 $countvar++;
817 } else {
818 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
819 }
820 $countvar++;
821 } elsif ($intype eq 'indel') { #indels
822 defined $pos_index and defined $ref_indel_index and defined $maxgtype_index or die "Error: unable to find the pos, ref_indel and max_gtype column header in $variantfile\n";
823 my ($pos, $call, $hom, @other) = @field[$pos_index, $ref_indel_index, $maxgtype_index];
824 if ($includeinfo) {
825 defined $quality_index and push @other, $field[$quality_index];
826 defined $maxgtype_index and push @other, $field[$maxgtype_index];
827 defined $bp1_reads_index and push @other, $field[$bp1_reads_index];
828 defined $ref_reads_index and push @other, $field[$ref_reads_index];
829 defined $indel_reads_index and push @other, $field[$indel_reads_index];
830 defined $other_reads_index and push @other, $field[$other_reads_index];
831 }
832
833 #hg19 coordinate below; insertion needs position adjustment!!! deletion is fine
834 #948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2
835 #978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0
836 #1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4
837 #1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0
838 #185137455 11I10M2I TATGTGTCCT -----------TTTTTTATTT--/AAATGATAGACTTTTTTTTTTAA ATTTCAGAAA 1126 het 988 hom 45 20 24 7 N/A 0 0
839 #1276931 2D41M4I CACACACATG CACACACACGCACACACGTGCAATGTGAAAACACCTCATGCAG----/--CACACACGCACACACGTGCAATGTGAAAACACCTCATGCAGACAC ACACATGCAC 548 hom 16 het 8 0 11 11 N/A 0 0
840
841 my @obs = split (/\//, $call);
842 @obs == 2 or die "Error: observed indel allele $call should correspond to two alleles: <$_>\n";
843 if ($obs[0] =~ m/^\-+$/) { #insertion
844 my $len = length ($obs[0]);
845 print $chr, "\t", $pos-1, "\t", $pos-1, "\t", '-', "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n";
846 } elsif ($obs[1] =~ m/^\-+$/) { #deletion
847 my $len = length ($obs[0]);
848 print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[0], "\t", '-', "\t", $hom, "\t", join ("\t", @other), "\n";
849 } elsif (length ($obs[0]) eq length ($obs[1])) { #block substitution
850 $obs[0] =~ s/\-//g;
851 $obs[1] =~ s/\-//g;
852 print $chr, "\t", $pos, "\t", $pos+length($obs[0])-1, "\t", $obs[0], "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n";
853 } else {
854 die "Error: invalid record found in indel line: <$_>\n";
855 }
856 $countvar++;
857 } else {
858 die "Error: invalid record found in $variantfile (11 or 15 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
859 }
860 }
861 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
862 }
863
864 sub convertVCF4 {
865 my ($variantfile) = @_;
866
867 my ($countline, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0 0/;
868
869 my ($source_program); #the program that generated the VCF4 file
870 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
871
872 while (<VAR>) {
873 $countline++;
874
875 if (m/^##fileformat=VCFv(\d+\.)/) {
876 $1<4 and print STDERR "ERROR: Input file is not in VCF version 4 format but is $_" and exit;
877 }
878 if (m/^##UnifiedGenotyper/) {
879 $source_program = 'gatksnp';
880 print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK UnifiedGenotyper\n";
881 print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth\n";
882 $fraction and print STDERR "WARNING: the --fraction argument will be ignored for GATK SNP calls!!!\n";
883 $confraction and print STDERR "WARNING: the --confraction argument will be ignored for GATK SNP calls!!!\n";
884 }
885 if (m/^##IndelGenotyper/) {
886 $source_program = 'gatkindel';
887 print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK IndelGenotyper\n";
888 print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality\n";
889 }
890
891 m/^#/ and next; #skip comment lines
892 s/[\r\n]+$//; #delete trailing new lines
893 my $otherinfo = $_; #this is the complete line (when -includeinfo is set, the entire line will be included in output file)
894
895 #format description: http://www.1000genomes.org/wiki/Analysis/vcf4.0
896 #standard VCF4 should have 8 columns, but some software may produce more columns (for example, for genotype calls). The first 8 columns should follow the specification
897
898 #example of VCF4 generated by GATK SNP caller
899 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
900 #1 55 . T G 34.82 . DP=2;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=14.16;MQ0=0;QD=17.41;SB=-10.00 GT:DP:GL:GQ 0/1:1:-6.66,-0.30,-0.00:1.76
901 #1 2646 . G A 40.91 . DP=4;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=7.50;MQ0=3;QD=10.23;SB=-10.00 GT:DP:GL:GQ 0/1:1:-7.27,-0.30,-0.00:1.76
902
903 #example of VCF4 generated by GATK indel caller
904 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
905 #1 2525324 . G GC . PASS AC=5,5;DP=12;MM=4.8,3.7142856;MQ=29.0,42.285713;NQSBQ=33.0,46.463768;NQSMM=0.24,0.20289855;SC=0,5,1,6 GT 0/1
906 #1 3553372 . GC G . PASS AC=6,6;DP=6;MM=0.8333333,0.0;MQ=60.0,0.0;NQSBQ=63.533333,0.0;NQSMM=0.0,0.0;SC=0,6,0,0 GT 1/0
907 #1 6093011 . CG C . PASS AC=31,31;DP=32;MM=0.7096774,2.0;MQ=59.64516,60.0;NQSBQ=64.192184,39.666668;NQSMM=0.0,0.11111111;SC=23,8,0,1 GT 1/0
908
909 #example of VCF4 generated by 1000G
910 #CHROM POS ID REF ALT QUAL FILTER INFO
911 #1 533 . G C . PASS AA=.;AC=6;AN=120;DP=423
912 #1 41342 . T A . PASS AA=.;AC=29;AN=120;DP=188
913 #1 41791 . G A . PASS AA=.;AC=5;AN=120;DP=192
914 #1 44449 . T C . PASS AA=C;AC=2;AN=120;DP=166
915 #1 44539 rs2462492 C T . PASS AA=T;AC=2;AN=120;DP=131
916
917 #example of VCF4 generated by 1000G
918 #CHROM POS ID REF ALT QUAL FILTER INFO
919 #1 1000153 . TCACA T 100 PASS AF=0.115095;HP=1;NF=16;NR=13;NS=52;CA=0;DP=615
920 #1 1000906 . CA C 48 PASS AF=0.0772696;HP=1;NF=2;NR=9;NS=51;CA=0;DP=281
921 #1 1000950 rs60561655;-/G CG C 100 PASS AF=0.447771;HP=5;DB;NF=10;NR=20;NS=50;CA=M;DP=291
922 #1 1010786 rs36095298;-/G,mills,venter A AG 100 PASS AF=0.774334;HP=1;DB;NF=21;NR=27;NS=51;CA=0;DP=306
923 #1 1026158 . T TGGGGG 100 PASS AF=0.115637;HP=1;NF=5;NR=2;NS=52;CA=0;DP=591
924
925 #reserved VCF4 sub-fields in the INFO field
926 # * AA ancestral allele
927 # * AC allele count in genotypes, for each ALT allele, in the same order as listed
928 # * AF allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes
929 # * AN total number of alleles in called genotypes
930 # * BQ RMS base quality at this position
931 # * CIGAR cigar string describing how to align an alternate allele to the reference allele
932 # * DB dbSNP membership
933 # * DP combined depth across samples, e.g. DP=154
934 # * END end position of the variant described in this record (esp. for CNVs)
935 # * H2 membership in hapmap2
936 # * MQ RMS mapping quality, e.g. MQ=52
937 # * MQ0 Number of MAPQ == 0 reads covering this record
938 # * NS Number of samples with data
939 # * SB strand bias at this position
940 # * SOMATIC indicates that the record is a somatic mutation, for cancer genomics
941 # * VALIDATED validated by follow-up experiment
942
943
944 #SAMtools/BCFtools specific information
945 #SAMtools/BCFtools may write the following tags in the INFO field in VCF/BCF.
946 #Tag Description
947 #I16 16 integers:
948 #1 #reference Q13 bases on the forward strand 2 #reference Q13 bases on the reverse strand
949 #3 #non-ref Q13 bases on the forward strand 4 #non-ref Q13 bases on the reverse strand
950 #5 sum of reference base qualities 6 sum of squares of reference base qualities
951 #7 sum of non-ref base qualities 8 sum of squares of non-ref base qualities
952 #9 sum of ref mapping qualities 10 sum of squares of ref mapping qualities
953 #11 sum of non-ref mapping qualities 12 sum of squares of non-ref mapping qualities
954 #13 sum of tail distance for ref bases 14 sum of squares of tail distance for ref bases
955 #15 sum of tail distance for non-ref bases 16 sum of squares of tail distance for non-ref
956 #INDEL Indicating the variant is an INDEL.
957 #DP The number of reads covering or bridging POS.
958 #DP4 Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted.
959 #PV4 P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t)
960 #FQ Consensus quality. If positive, FQ equals the phred-scaled probability of there being two or more different alleles. If negative, FQ equals the minus phred-scaled probability of all chromosomes being identical. Notably, given one sample, FQ is positive at hets and negative at homs.
961 #AF1 EM estimate of the site allele frequency of the strongest non-reference allele.
962 #CI95 Equal-tail (Bayesian) credible interval of the site allele frequency at the 95% level.
963 #PC2 Phred-scaled probability of the alternate allele frequency of group1 samples being larger (,smaller) than of group2 samples.
964 #PCHI2 Posterior weighted chi^2 P-value between group1 and group2 samples. This P-value is conservative.
965 #QCHI2 Phred-scaled PCHI2
966 #RP Number of permutations yeilding a smaller PCHI2
967
968 #example of triallelic variants generated by mpileup/bcftools
969 #1 156706559 . A C,G 114 . DP=20;AF1=1;CI95=1,1;DP4=0,0,1,19;MQ=60;FQ=-63 GT:PL:GQ 1/2:237,126,90,162,0,138:99
970 #6 31129642 . A G,C 76 . DP=31;AF1=1;CI95=1,1;DP4=0,0,28,3;MQ=60;FQ=-75 GT:PL:GQ 1/2:255,194,146,164,0,119:99
971 #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99
972
973 my @field=split(/\t/,$_);
974 @field >=8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$_>\n";
975 my ($chr, $start, $ID, $ref_allele, $mut_allele, $quality_score, $filter, $info, $format, $sample) = @field;
976 my ($end);
977 my ($mut_allele2, $zygosity);
978
979 if ($filterword) { #ensure that the filter field contains the filterword
980 $filter =~ m/\b$filterword\b/i or next;
981 }
982
983
984 #sometimes the alleles are not in the same case
985 #chr1 1869771 1869774 actc aCTctc 43.5 13 INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46
986 $ref_allele = uc $ref_allele;
987 $mut_allele = uc $mut_allele;
988
989 #if ($ID eq '.' || $ID =~ /^rs/) { #per MISHIMA, Hiroyuki suggestion (vcf4's third column (ID column) are not always ".")
990 # $end = $start; #this block is commented out on 2011feb19
991 #}
992
993 if ($mut_allele eq '.') { #no variant call was made at this position
994 next;
995 }
996
997 if ($mut_allele =~ m/([^,]+),([^,]+)/) {
998 $mut_allele = $1;
999 $mut_allele2 = $2;
1000 }
1001
1002 if(length($ref_allele)==1 && length($mut_allele)==1) { ### output snv
1003 my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
1004 my ($MappingQuality) = $info =~ /MQ=([^;]+)/;
1005 my ($QualityByDepth) = $info =~ /QD=([^;]+)/;
1006
1007
1008
1009 if ($coverage) {
1010 defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
1011 if ($maxcoverage) {
1012 defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
1013 }
1014 }
1015
1016 if ($snpqual) {
1017 defined $QualityByDepth and $QualityByDepth >= $snpqual || next; #the QD was used here as quality score
1018 }
1019
1020 if (defined $sample) {
1021 if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
1022 $zygosity = 'het';
1023 $counthet++;
1024 } elsif ($sample =~ m#^1/1#) {
1025 $zygosity = 'hom';
1026 $counthom++;
1027 } else {
1028 $zygosity = 'unknown';
1029 $countunknown++;
1030 }
1031 } else {
1032 $zygosity = 'unknown';
1033 $countunknown++;
1034 }
1035
1036 #the subject is called as homozygous for the first alternative allele (genotype 1/1. i.e. C/C), but since there was one read containing A, samtools still keep both alleles in the VCF file (but gives a very low probabilities for it).
1037 #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99
1038 if ($mut_allele2 and $zygosity eq 'hom') {
1039 $mut_allele2 = '';
1040 }
1041
1042 if (not $mut_allele2) {
1043 if ($ref_allele eq 'A' and $mut_allele eq 'G' or $ref_allele eq 'G' and $mut_allele eq 'A' or $ref_allele eq 'C' and $mut_allele eq 'T' or $ref_allele eq 'T' and $mut_allele eq 'C') {
1044 $countti++;
1045
1046 } else {
1047 $counttv++;
1048 }
1049 }
1050
1051 print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, "\t$zygosity", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";
1052
1053 if ($allallele) {
1054 if ($mut_allele2) {
1055 print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele2, "\t$zygosity", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";
1056 }
1057 }
1058
1059 $countsnp++;
1060 } elsif (length($ref_allele) > 1 || length($mut_allele) > 1) { ### output indel
1061 my ($indel_read_depth1, $indel_read_depth2) = $info =~ /AC=([^,;]+),([^,;]+)/; #number of reads supporting consensus indel, any indel
1062 my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
1063
1064 if ($coverage) {
1065 defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
1066 if ($maxcoverage) {
1067 defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
1068 }
1069 }
1070
1071 if (defined $indel_read_depth1 and defined $unfiltered_read_depth) {
1072 $indel_read_depth1/$unfiltered_read_depth >= $fraction or next; #do not meet minimum alternative allele fraction threshold
1073 $indel_read_depth1/$indel_read_depth2 >= $confraction or next;
1074 }
1075
1076 my ($MappingQuality) = $info =~ /MQ=([^;]+),/;
1077
1078 #example VCF4 records below:
1079 #20 2 . TCG T . PASS DP=100
1080 #Chr1 5473 . AT ATT 23.5 . INDEL;DP=16;AF1=0.5;CI95=0.5,0.5;DP4=4,2,3,1;MQ=42;PV4=1,0.41,0.042,0.24
1081 #Chr1 6498 . ATTTT ATTTTT 53.5 . INDEL;DP=9;AF1=1;CI95=1,1;DP4=0,0,5,3;MQ=28
1082
1083 if(length($ref_allele) > length ($mut_allele)) { # deletion or block substitution
1084 my $head = substr($ref_allele, 0, length ($mut_allele));
1085 if ($head eq $mut_allele) {
1086 print $chr,"\t";
1087 print $start+length($head),"\t";
1088 print $start+length($ref_allele)-1,"\t";
1089
1090 $ref_allele = substr ($ref_allele, length ($mut_allele));
1091 print $ref_allele,"\t";
1092 print "-";
1093 } else {
1094 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t";
1095 }
1096 } elsif(length($mut_allele) >= length ($ref_allele)) { # insertion or block substitution
1097 my $head = substr ($mut_allele, 0, length ($ref_allele));
1098 if ($head eq $ref_allele) {
1099 print $chr,"\t";
1100 print $start+length($ref_allele)-1,"\t";
1101 print $start+length($ref_allele)-1,"\t";
1102
1103 $mut_allele = substr ($mut_allele, length ($ref_allele));
1104 print "-\t";
1105 print $mut_allele;
1106 } else {
1107 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t";
1108 }
1109 }
1110
1111 if (defined $sample) {
1112 if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
1113 print "\thet";
1114 $counthet++;
1115 } elsif ($sample =~ m#^1/1#) {
1116 print "\thom";
1117 $counthom++;
1118 } # BEGIN ARQ
1119 elsif ($sample =~ m#^./.#) {
1120 print "\tunknown";
1121 $countunknown++;
1122 } # END ARQ
1123 }
1124
1125 print "\t", $quality_score;
1126 defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
1127
1128 defined $indel_read_depth1 and print "\t", $indel_read_depth1;
1129 defined $MappingQuality and print "\t", $MappingQuality;
1130 $includeinfo and print "\t", $otherinfo;
1131 print "\n";
1132 $countindel++;
1133
1134
1135 #do the same thing again, exactly like above, except that we work on second mutation;
1136 #in the future, consider rewrite this paragraph to make the code more elegant
1137 if ($allallele) {
1138 if ($mut_allele2) {
1139 if(length($ref_allele) > length ($mut_allele2)) { # deletion or block substitution
1140 my $head = substr($ref_allele, 0, length ($mut_allele2));
1141 if ($head eq $mut_allele2) {
1142 print $chr,"\t";
1143 print $start+length($head),"\t";
1144 print $start+length($ref_allele)-1,"\t";
1145
1146 $ref_allele = substr ($ref_allele, length ($mut_allele2));
1147 print $ref_allele,"\t";
1148 print "-";
1149 } else {
1150 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
1151 }
1152 } elsif(length($mut_allele2) > length ($ref_allele)) { # insertion or block substitution
1153 my $head = substr ($mut_allele2, 0, length ($ref_allele));
1154 if ($head eq $ref_allele) {
1155 print $chr,"\t";
1156 print $start+length($ref_allele)-1,"\t";
1157 print $start+length($ref_allele)-1,"\t";
1158
1159 $mut_allele = substr ($mut_allele2, length ($ref_allele));
1160 print "-\t";
1161 print $mut_allele2;
1162 } else {
1163 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
1164 }
1165 }
1166
1167 if (defined $sample) {
1168 if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
1169 print "\thet";
1170 $counthet++;
1171 } elsif ($sample =~ m#^1/1#) {
1172 print "\thom";
1173 $counthom++;
1174 } # BEGIN ARQ
1175 elsif ($sample =~ m#^./.#) {
1176 print "\tunknown";
1177 $countunknown++;
1178 } # END ARQ
1179 }
1180
1181 print "\t", $quality_score;
1182 defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
1183
1184 defined $indel_read_depth1 and print "\t", $indel_read_depth1;
1185 defined $MappingQuality and print "\t", $MappingQuality;
1186 $includeinfo and print "\t", $otherinfo;
1187 print "\n";
1188
1189 }
1190 }
1191 }
1192 $countvar++;
1193 }
1194 my $triallelic = $countsnp-$countti-$counttv;
1195 print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
1196 print STDERR "NOTICE: Among ${\($counthet+$counthom+$countunknown)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
1197 print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
1198 }
1199
1200
1201 =head1 SYNOPSIS
1202
1203 convert2annovar.pl [arguments] <variantfile>
1204
1205 Optional arguments:
1206 -h, --help print help message
1207 -m, --man print complete documentation
1208 -v, --verbose use verbose output
1209 --format <string> input format (default: pileup)
1210 --outfile <file> output file name (default: STDOUT)
1211 --snpqual <float> quality score threshold in pileup file (default: 20)
1212 --snppvalue <float> SNP P-value threshold in GFF3-SOLiD file (default: 1)
1213 --coverage <int> read coverage threshold in pileup file (default: 0)
1214 --maxcoverage <int> maximum coverage threshold (default: none)
1215 --includeinfo include supporting information in output
1216 --chr <string> specify the chromosome (for CASAVA format)
1217 --chrmt <string> chr identifier for mitochondria (default: M)
1218 --altcov <int> alternative allele coverage threshold (for pileup format)
1219 --fraction <float> minimum allelic fraction to claim a mutation (for pileup/vcf4_indel format)
1220 --species <string> if human, convert chr23/24/25 to X/Y/M (for gff3-solid format)
1221 --filter <string> output variants with this filter (case insensitive, for vcf4 format)
1222 --confraction <float> minimum consensus indel / all indel fraction (for vcf4_indel format)
1223 --allallele print all alleles when multiple calls are present (for vcf4 format)
1224
1225 Function: convert variant call file generated from various software programs
1226 into ANNOVAR input format
1227
1228 Example: convert2annovar.pl -format pileup -outfile variant.query variant.pileup
1229 convert2annovar.pl -format cg -outfile variant.query variant.cg
1230 convert2annovar.pl -format gff3-solid -outfile variant.query variant.snp.gff
1231 convert2annovar.pl -format soap variant.snp > variant.avinput
1232 convert2annovar.pl -format maq variant.snp > variant.avinput
1233 convert2annovar.pl -format casava -chr 1 variant.snp > variant.avinput
1234 convert2annovar.pl -format vcf4 variantfile > variant.avinput
1235 convert2annovar.pl -format vcf4 -filter pass variantfile > variant.avinput
1236
1237 Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $
1238
1239 =head1 OPTIONS
1240
1241 =over 8
1242
1243 =item B<--help>
1244
1245 print a brief usage message and detailed explanation of options.
1246
1247 =item B<--man>
1248
1249 print the complete manual of the program.
1250
1251 =item B<--verbose>
1252
1253 use verbose output.
1254
1255 =item B<--format>
1256
1257 the format of the input files.
1258
1259 =item B<--outfile>
1260
1261 specify the output file name. By default, output is written to STDOUT.
1262
1263 =item B<--snpqual>
1264
1265 quality score threshold in the pileup file, such that variant calls with lower
1266 quality scores will not be printed out in the output file. When VCF4 file is
1267 used, this argument works on the Quality-by-Depth measure, rather than the raw
1268 quality measure.
1269
1270 =item B<--coverage>
1271
1272 read coverage threshold in the pileup file, such that variants calls generated
1273 with lower coverage will not be printed in the output file.
1274
1275 =item B<--includeinfo>
1276
1277 specify that the output should contain additional information in the input line.
1278 By default, only the chr, start, end, reference allele, observed allele and
1279 homozygosity status are included in output files.
1280
1281 =item B<--chr>
1282
1283 specify the chromosome for CASAVA format
1284
1285 =item B<--chrmt>
1286
1287 specify the name of mitochondria chromosome (default is MT)
1288
1289 =item B<--altcov>
1290
1291 the minimum coverage of the alternative (mutated) allele to be printed out in
1292 output
1293
1294 =item B<--fraction>
1295
1296 specify the minimum fraction of alternative allele, to print out the mutation.
1297 For example, a site has 10 reads, 3 supports alternative allele. A -fraction of
1298 0.4 will not allow the mutation to be printed out.
1299
1300 =item B<--species>
1301
1302 specify the species from which the sequencing data is obtained. For the GFF3-
1303 SOLiD format, when species is human, the chromosome 23, 24 and 25 will be
1304 converted to X, Y and M, respectively.
1305
1306 =item B<--filter>
1307
1308 for VCF4 file, only print out variant calls with this filter annotated. For
1309 example, if using GATK VariantFiltration walker, you will see PASS,
1310 GATKStandard, HARD_TO_VALIDATE, etc in the filter field. Using 'pass' as a
1311 filter is recommended in this case.
1312
1313 =item B<--confraction>
1314
1315 consesus indel fraction, calculated as reads supporting consensus indels divided
1316 by reads supporting any indels
1317
1318 =item B<--allallele>
1319
1320 print all alleles for mutations at a locus, rather than the first allele, if the
1321 input VCF4 file contains multiple alternative alleles for a mutation. By
1322 default, this option is off. When it is on, two lines will be printed out in the
1323 output, and both will have the same quality scores as VCF4 does not provide
1324 separate quality scores for individual alleles.
1325
1326 =back
1327
1328 =head1 DESCRIPTION
1329
1330 This program is used to convert variant call file generated from various
1331 software programs into ANNOVAR input format. Currently, the program can handle
1332 Samtools genotype-calling pileup format, Solid GFF format, Complete Genomics
1333 variant format, SOAP format. These formats are described below.
1334
1335 =over 8
1336
1337 =item * B<pileup format>
1338
1339 The pileup format can be produced by the Samtools genotyping calling subroutine.
1340 Note that the phrase pileup format can be used in several instances, and here I
1341 am only referring to the pileup files that contains the actual genotype calls.
1342
1343 Using SamTools, given an alignment file in BAM format, a pileup file with
1344 genotype calls can be produced by the command below:
1345
1346 samtools pileup -vcf ref.fa aln.bam> raw.pileup
1347 samtools.pl varFilter raw.pileup > final.pileup
1348
1349 ANNOVAR will automatically filter the pileup file so that only SNPs reaching a
1350 quality threshold are printed out (default is 20, use --snpqual argument to
1351 change this). Most likely, users may want to also apply a coverage threshold,
1352 such that SNPs calls from only a few reads are not considered. This can be
1353 achieved using the -coverage argument (default value is 0).
1354
1355 An example of pileup files for SNPs is shown below:
1356
1357 chr1 556674 G G 54 0 60 16 a,.....,...,.... (B%A+%7B;0;%=B<:
1358 chr1 556675 C C 55 0 60 16 ,,..A..,...,.... CB%%5%,A/+,%....
1359 chr1 556676 C C 59 0 60 16 g,.....,...,.... .B%%.%.?.=/%...1
1360 chr1 556677 G G 75 0 60 16 ,$,.....,...,.... .B%%9%5A6?)%;?:<
1361 chr1 556678 G K 60 60 60 24 ,$.....,...,....^~t^~t^~t^~t^~t^~t^~t^~t^~t B%%B%<A;AA%??<=??;BA%B89
1362 chr1 556679 C C 61 0 60 23 .....a...a....,,,,,,,,, %%1%&?*:2%*&)(89/1A@B@@
1363 chr1 556680 G K 88 93 60 23 ..A..,..A,....ttttttttt %%)%7B:B0%55:7=>>A@B?B;
1364 chr1 556681 C C 102 0 60 25 .$....,...,....,,,,,,,,,^~,^~. %%3%.B*4.%.34.6./B=?@@>5.
1365 chr1 556682 A A 70 0 60 24 ...C,...,....,,,,,,,,,,. %:%(B:A4%7A?;A><<999=<<
1366 chr1 556683 G G 99 0 60 24 ....,...,....,,,,,,,,,,. %A%3B@%?%C?AB@BB/./-1A7?
1367
1368 The columns are chromosome, 1-based coordinate, reference base, consensus base,
1369 consensus quality, SNP quality, maximum mapping quality of the reads covering
1370 the sites, the number of reads covering the site, read bases and base qualities.
1371
1372 An example of pileup files for indels is shown below:
1373
1374 seq2 156 * +AG/+AG 71 252 99 11 +AG * 3 8 0
1375
1376 ANNOVAR automatically recognizes both SNPs and indels in pileup file, and process them correctly.
1377
1378 =item * B<GFF3-SOLiD format>
1379
1380 The SOLiD provides a GFF3-compatible format for SNPs, indels and structural
1381 variants. A typical example file is given below:
1382
1383 ##gff-version 3
1384 ##solid-gff-version 0.3
1385 ##source-version 2
1386 ##type DNA
1387 ##date 2009-03-13
1388 ##time 0:0:0
1389 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
1390 ##reference-file
1391 ##input-files Yoruban_snp_10x.txt
1392 ##run-path
1393 chr_name AB_SOLiD SNP caller SNP coord coord 1 . . coverage=# cov;ref_base=ref;ref_score=score;ref_confi=confi;ref_single=Single;ref_paired=Paired;consen_base=consen;consen_score=score;consen_confi=conf;consen_single=Single;consen_paired=Paired;rs_id=rs_id,dbSNP129
1394 1 AB_SOLiD SNP caller SNP 997 997 1 . . coverage=3;ref_base=A;ref_score=0.3284;ref_confi=0.9142;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6716;consen_confi=0.9349;consen_single=0/0;consen_paired=2/2
1395 1 AB_SOLiD SNP caller SNP 2061 2061 1 . . coverage=2;ref_base=G;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.8985;consen_single=0/0;consen_paired=2/2
1396 1 AB_SOLiD SNP caller SNP 4770 4770 1 . . coverage=2;ref_base=A;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=G;consen_score=1.0000;consen_confi=0.8854;consen_single=0/0;consen_paired=2/2
1397 1 AB_SOLiD SNP caller SNP 4793 4793 1 . . coverage=14;ref_base=A;ref_score=0.0723;ref_confi=0.8746;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6549;consen_confi=0.8798;consen_single=0/0;consen_paired=9/9
1398 1 AB_SOLiD SNP caller SNP 6241 6241 1 . . coverage=2;ref_base=T;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.7839;consen_single=0/0;consen_paired=2/2
1399
1400 Newer version of ABI BioScope now use diBayes caller, and the output file is given below:
1401
1402 ##gff-version 3
1403 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
1404 ##List of SNPs. Date Sat Dec 18 10:30:45 2010 Stringency: medium Mate Pair: 1 Read Length: 50 Polymorphism Rate: 0.003000 Bayes Coverage: 60 Bayes_Single_SNP: 1 Filter_Single_SNP: 1 Quick_P_Threshold: 0.997000 Bayes_P_Threshold: 0.040000 Minimum_Allele_Ratio: 0.150000 Minimum_Allele_Ratio_Multiple_of_Dicolor_Error: 100
1405 ##1 chr1
1406 ##2 chr2
1407 ##3 chr3
1408 ##4 chr4
1409 ##5 chr5
1410 ##6 chr6
1411 ##7 chr7
1412 ##8 chr8
1413 ##9 chr9
1414 ##10 chr10
1415 ##11 chr11
1416 ##12 chr12
1417 ##13 chr13
1418 ##14 chr14
1419 ##15 chr15
1420 ##16 chr16
1421 ##17 chr17
1422 ##18 chr18
1423 ##19 chr19
1424 ##20 chr20
1425 ##21 chr21
1426 ##22 chr22
1427 ##23 chrX
1428 ##24 chrY
1429 ##25 chrM
1430 # source-version SOLiD BioScope diBayes(SNP caller)
1431 #Chr Source Type Pos_Start Pos_End Score Strand Phase Attributes
1432 chr1 SOLiD_diBayes SNP 221367 221367 0.091151 . . genotype=R;reference=G;coverage=3;refAlleleCounts=1;refAlleleStarts=1;refAlleleMeanQV=29;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=27;diColor1=11;diColor2=33;het=1;flag=
1433 chr1 SOLiD_diBayes SNP 555317 555317 0.095188 . . genotype=Y;reference=T;coverage=13;refAlleleCounts=11;refAlleleStarts=10;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=00;diColor2=22;het=1;flag=
1434 chr1 SOLiD_diBayes SNP 555327 555327 0.037582 . . genotype=Y;reference=T;coverage=12;refAlleleCounts=6;refAlleleStarts=6;refAlleleMeanQV=19;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=12;diColor2=30;het=1;flag=
1435 chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag=
1436 chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag=
1437 The file conforms to standard GFF3 specifications, but the last column is solid-
1438 specific and it gives certain parameters for the SNP calls.
1439
1440 An example of the short indel format by GFF3-SOLiD is given below:
1441
1442 ##gff-version 3
1443 ##solid-gff-version 0.3
1444 ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
1445 ##type DNA
1446 ##date 2009-01-26
1447 ##time 18:33:20
1448 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
1449 ##reference-file
1450 ##input-files ../../mp-results/JOAN_20080104_1.pas,../../mp-results/BARB_20071114_1.pas,../../mp-results/BARB_20080227_2.pas
1451 ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-w2x25-2x-4x-8x-10x/2x
1452 ##Filter-settings: max-ave-read-pos=none,min-ave-from-end-pos=9.1,max-nonreds-4filt=2,min-insertion-size=none,min-deletion-size=none,max-insertion-size=none,max-deletion-size=none,require-called-indel-size?=T
1453 chr1 AB_SOLiD Small Indel Tool deletion 824501 824501 1 . . del_len=1;tight_chrom_pos=824501-824502;loose_chrom_pos=824501-824502;no_nonred_reads=2;no_mismatches=1,0;read_pos=4,6;from_end_pos=21,19;strands=+,-;tags=R3,F3;indel_sizes=-1,-1;read_seqs=G3021212231123203300032223,T3321132212120222323222101;dbSNP=rs34941678,chr1:824502-824502(-),EXACT,1,/GG
1454 chr1 AB_SOLiD Small Indel Tool insertion_site 1118641 1118641 1 . . ins_len=3;tight_chrom_pos=1118641-1118642;loose_chrom_pos=1118641-1118642;no_nonred_reads=2;no_mismatches=0,1;read_pos=17,6;from_end_pos=8,19;strands=+,+;tags=F3,R3;indel_sizes=3,3;read_seqs=T0033001100022331122033112,G3233112203311220000001002
1455
1456 The keyword deletion or insertion_site is used in the fourth column to indicate
1457 that file format.
1458
1459 An example of the medium CNV format by GFF3-SOLiD is given below:
1460
1461 ##gff-version 3
1462 ##solid-gff-version 0.3
1463 ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
1464 ##type DNA
1465 ##date 2009-01-27
1466 ##time 15:54:36
1467 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
1468 ##reference-file
1469 ##input-files big_d20e5-del12n_up-ConsGrp-2nonred.pas.sum
1470 ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-results-lmp-e5/big_d20e5-indel_950_2050
1471 chr1 AB_SOLiD Small Indel Tool deletion 3087770 3087831 1 . . del_len=62;tight_chrom_pos=none;loose_chrom_pos=3087768-3087773;no_nonred_reads=2;no_mismatches=2,2;read_pos=27,24;from_end_pos=23,26;strands=-,+;tags=F3,F3;indel_sizes=-62,-62;read_seqs=T11113022103331111130221213201111302212132011113022,T02203111102312122031111023121220311111333012203111
1472 chr1 AB_SOLiD Small Indel Tool deletion 4104535 4104584 1 . . del_len=50;tight_chrom_pos=4104534-4104537;loose_chrom_pos=4104528-4104545;no_nonred_reads=3;no_mismatches=0,4,4;read_pos=19,19,27;from_end_pos=31,31,23;strands=+,+,-;tags=F3,R3,R3;indel_sizes=-50,-50,-50;read_seqs=T31011011013211110130332130332132110110132020312332,G21031011013211112130332130332132110132132020312332,G20321302023001101123123303103303101113231011011011
1473 chr1 AB_SOLiD Small Indel Tool insertion_site 2044888 2044888 1 . . ins_len=18;tight_chrom_pos=2044887-2044888;loose_chrom_pos=2044887-2044889;no_nonred_reads=2;bead_ids=1217_1811_209,1316_908_1346;no_mismatches=0,2;read_pos=13,15;from_end_pos=37,35;strands=-,-;tags=F3,F3;indel_sizes=18,18;read_seqs=T31002301231011013121000101233323031121002301231011,T11121002301231011013121000101233323031121000101231;non_indel_no_mismatches=3,1;non_indel_seqs=NIL,NIL
1474 chr1 AB_SOLiD Small Indel Tool insertion_site 74832565 74832565 1 . . ins_len=16;tight_chrom_pos=74832545-74832565;loose_chrom_pos=74832545-74832565;no_nonred_reads=2;bead_ids=1795_181_514,1651_740_519;no_mismatches=0,2;read_pos=13,13;from_end_pos=37,37;strands=-,-;tags=F3,R3;indel_sizes=16,16;read_seqs=T33311111111111111111111111111111111111111111111111,G23311111111111111111111111111111111111111311011111;non_indel_no_mismatches=1,0;non_indel_seqs=NIL,NIL
1475
1476 An example of the large indel format by GFF3-SOLiD is given below:
1477
1478 ##gff-version 3
1479 ##solid-gff-version 0.3
1480 ##source-version ???
1481 ##type DNA
1482 ##date 2009-03-13
1483 ##time 0:0:0
1484 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
1485 ##reference-file
1486 ##input-files /data/results5/yoruban_strikes_back_large_indels/LMP/five_mm_unique_hits_no_rescue/5_point_6x_del_lib_1/results/NA18507_inter_read_indels_5_point_6x.dat
1487 ##run-path
1488 chr1 AB_SOLiD Large Indel Tool insertion_site 1307279 1307791 1 . . deviation=-742;stddev=7.18;ref_clones=-;dev_clones=4
1489 chr1 AB_SOLiD Large Indel Tool insertion_site 2042742 2042861 1 . . deviation=-933;stddev=8.14;ref_clones=-;dev_clones=3
1490 chr1 AB_SOLiD Large Indel Tool insertion_site 2443482 2444342 1 . . deviation=-547;stddev=11.36;ref_clones=-;dev_clones=17
1491 chr1 AB_SOLiD Large Indel Tool insertion_site 2932046 2932984 1 . . deviation=-329;stddev=6.07;ref_clones=-;dev_clones=14
1492 chr1 AB_SOLiD Large Indel Tool insertion_site 3166925 3167584 1 . . deviation=-752;stddev=13.81;ref_clones=-;dev_clones=14
1493
1494 An example of the CNV format by GFF3-SOLiD if given below:
1495
1496 ##gff-version 3
1497 ##solid-gff-version 0.3
1498 ##source-version ???
1499 ##type DNA
1500 ##date 2009-03-13
1501 ##time 0:0:0
1502 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
1503 ##reference-file
1504 ##input-files Yoruban_cnv.coords
1505 ##run-path
1506 chr1 AB_CNV_PIPELINE repeat_region 1062939 1066829 . . . fraction_mappable=51.400002;logratio=-1.039300;copynum=1;numwindows=1
1507 chr1 AB_CNV_PIPELINE repeat_region 1073630 1078667 . . . fraction_mappable=81.000000;logratio=-1.409500;copynum=1;numwindows=2
1508 chr1 AB_CNV_PIPELINE repeat_region 2148325 2150352 . . . fraction_mappable=98.699997;logratio=-1.055000;copynum=1;numwindows=1
1509 chr1 AB_CNV_PIPELINE repeat_region 2245558 2248109 . . . fraction_mappable=78.400002;logratio=-1.042900;copynum=1;numwindows=1
1510 chr1 AB_CNV_PIPELINE repeat_region 3489252 3492632 . . . fraction_mappable=59.200001;logratio=-1.119900;copynum=1;numwindows=1
1511 chr1 AB_CNV_PIPELINE repeat_region 5654415 5657276 . . . fraction_mappable=69.900002;logratio=1.114500;copynum=4;numwindows=1
1512 chr1 AB_CNV_PIPELINE repeat_region 9516165 9522726 . . . fraction_mappable=65.850006;logratio=-1.316700;numwindows=2
1513 chr1 AB_CNV_PIPELINE repeat_region 16795117 16841025 . . . fraction_mappable=44.600002;logratio=1.880778;copynum=7;numwindows=9
1514
1515 The keyword repeat_region is used here, although it actually refers to CNVs.
1516
1517 An example of the inversion format by GFF3-SOLiD is given below:
1518
1519 ##gff-version 3
1520 ##solid-gff-version 0.2
1521 ##generated by SOLiD inversion tool
1522 chr10 AB_SOLiD inversion 46443107 46479585 268.9 . . left=chr10:46443107-46443146;right=chr10:46479583-46479585;leftscore=295.0;rightscore=247.0;count_AAA_further_left=117;count_AAA_left=3;count_AAA_right=3;count_AAA_further_right=97;left_min_count_AAA=chr10:46443107-46443112;count_AAA_min_left=0;count_AAA_max_left=3;right_min_count_AAA=chr10:46479585-46479585;count_AAA_min_right=1;count_AAA_max_right=3;homozygous=UNKNOWN
1523 chr4 AB_SOLiD inversion 190822813 190850112 214.7 . . left=chr4:190822813-190822922;right=chr4:190850110-190850112;leftscore=140.0;rightscore=460.0;count_AAA_further_left=110;count_AAA_left=78;count_AAA_right=74;count_AAA_further_right=77;left_min_count_AAA=chr4:190822813-190822814;count_AAA_min_left=69;count_AAA_max_left=77;right_min_count_AAA=chr4:190850110-190850112;count_AAA_min_right=74;count_AAA_max_right=74;homozygous=NO
1524 chr6 AB_SOLiD inversion 168834969 168837154 175.3 . . left=chr6:168834969-168835496;right=chr6:168836643-168837154;leftscore=185.4;rightscore=166.2;count_AAA_further_left=67;count_AAA_left=43;count_AAA_right=40;count_AAA_further_right=59;left_min_count_AAA=chr6:168835058-168835124,chr6:168835143-168835161,chr6:168835176-168835181,chr6:168835231-168835262;count_AAA_min_left=23;count_AAA_max_left=29;right_min_count_AAA=chr6:168836643-168836652;count_AAA_min_right=23;count_AAA_max_right=31;homozygous=NO
1525
1526 The program should be able to recognize all the above GFF3-SOLiD format
1527 automatically, and handle them accordingly.
1528
1529 =item * B<Complete Genomics format>
1530
1531 This format is provided by the Complete Genomics company to their customers. The
1532 file var-[ASM-ID].tsv.bz2 includes a description of all loci where the assembled
1533 genome differs from the reference genome.
1534
1535 An example of the Complete Genomics format is shown below:
1536
1537 #BUILD 1.5.0.5
1538 #GENERATED_AT 2009-Nov-03 19:52:21.722927
1539 #GENERATED_BY dbsnptool
1540 #TYPE VAR-ANNOTATION
1541 #VAR_ANN_SET /Proj/Pipeline/Production_Data/REF/HUMAN-F_06-REF/dbSNP.csv
1542 #VAR_ANN_TYPE dbSNP
1543 #VERSION 0.3
1544
1545 >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
1546 1 2 all chr1 0 959 no-call = ?
1547 2 2 all chr1 959 972 = = =
1548 3 2 all chr1 972 1001 no-call = ?
1549 4 2 all chr1 1001 1008 = = =
1550 5 2 all chr1 1008 1114 no-call = ?
1551 6 2 all chr1 1114 1125 = = =
1552 7 2 all chr1 1125 1191 no-call = ?
1553 8 2 all chr1 1191 1225 = = =
1554 9 2 all chr1 1225 1258 no-call = ?
1555 10 2 all chr1 1258 1267 = = =
1556 12 2 all chr1 1267 1275 no-call = ?
1557 13 2 all chr1 1275 1316 = = =
1558 14 2 all chr1 1316 1346 no-call = ?
1559 15 2 all chr1 1346 1367 = = =
1560 16 2 all chr1 1367 1374 no-call = ?
1561 17 2 all chr1 1374 1388 = = =
1562 18 2 all chr1 1388 1431 no-call = ?
1563 19 2 all chr1 1431 1447 = = =
1564 20 2 all chr1 1447 1454 no-call = ?
1565
1566 The following information is provided in documentation from Complete Genomics, that describes the var-ASM format.
1567
1568 1. locus. Identifier of a particular genomic locus
1569 2. ploidy. The ploidy of the reference genome at the locus (= 2 for autosomes, 2 for pseudoautosomal regions on the sex chromosomes, 1 for males on the non-pseudoautosomal parts of the sex chromosomes, 1 for mitochondrion, '?' if varType is 'no-ref' or 'PAR-called-in-X'). The reported ploidy is fully determined by gender, chromosome and location, and is not inferred from the sequence data.
1570 3. haplotype. Identifier for each haplotype at the variation locus. For diploid genomes, 1 or 2. Shorthand of 'all' is allowed where the varType field is one of 'ref', 'no-call', 'no-ref', or 'PAR-called-in-X'. Haplotype numbering does not imply phasing; haplotype 1 in locus 1 is not necessarily in phase with haplotype 1 in locus 2. See hapLink, below, for phasing information.
1571 4. chromosome. Chromosome name in text: 'chr1','chr2', ... ,'chr22','chrX','chrY'. The mitochondrion is represented as 'chrM'. The pseudoautosomal regions within the sex chromosomes X and Y are reported at their coordinates on chromosome X.
1572 5. begin. Reference coordinate specifying the start of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
1573 6. end. Reference coordinate specifying the end of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
1574 7. varType. Type of variation, currently one of:
1575 snp: single-nucleotide polymorphism
1576 ins: insertion
1577 del: deletion
1578 sub: Substitution of one or more reference bases with the bases in the allele column
1579 'ref' : no variation; the sequence is identical to the reference sequence on the indicated haplotype
1580 no-call-rc: 'no-call reference consistent 'one or more bases are ambiguous, but the allele is potentially consistent with the reference
1581 no-call-ri: 'no-call reference inconsistent' one or more bases are ambiguous, but the allele is definitely inconsistent with the reference
1582 no-call: an allele is completely indeterminate in length and composition, i.e. alleleSeq = '?'
1583 no-ref: the reference sequence is unspecified at this locus.
1584 PAR-called-in-X: this locus overlaps one of the pseudoautosomal regions on the sex chromosomes. The called sequence is reported as diploid sequence on Chromosome X; on chromosome Y the sequence is reported as varType = 'PAR-called-in-X'.
1585 8. reference. The reference sequence for the locus of variation. Empty when varType is ins. A value of '=' indicates that the user must consult the reference for the sequence; this shorthand is only used in regions where no haplotype deviates from the reference sequence.
1586 9. alleleSeq. The observed sequence at the locus of variation. Empty when varType is del. '?' isused to indicate 0 or more unknown bases within the sequence; 'N' is used to indicate exactly one unknown base within the sequence.'=' is used as shorthand to indicate identity to the reference sequence for non-variant sequence, i.e. when varType is 'ref'.
1587 10. totalScore. A score corresponding to a single variation and haplotype, representing the confidence in the call.
1588 11. hapLink. Identifier that links a haplotype at one locus to haplotypes at other loci. Currently only populated for very proximate variations that were assembled together. Two calls that share a hapLink identifier are expected to be on the same haplotype,
1589 12. xRef. Field containing external variation identifiers, currently only populated for variations corroborated directly by dbSNP. Format: dbsnp:[rsID], with multiple entries separated by the semicolon (;).
1590
1591 In older versions of the format specification, the sub keyword used to be insdel
1592 keyword. ANNOVAR takes care of this.
1593
1594 =item * B<SOAPsnp format>
1595
1596 An example of the SOAP SNP caller format is shown below:
1597
1598 chr8 35782 A R 1 A 27 1 2 G 26 1 2 5 0.500000 2.00000 1 5
1599 chr8 35787 G R 0 G 25 4 6 A 17 2 4 10 0.266667 1.60000 0 5
1600
1601 The following information is provided in documentation from BGI who developed
1602 SOAP suite. It differs slightly from the description at the SOAPsnp website, and
1603 presumably the website is outdated.
1604
1605 Format description:(left to right)
1606 1. Chromosome name
1607 2. Position of locus
1608 3. Nucleotide at corresponding locus of reference sequence
1609 4. Genotype of sequencing sample
1610 5. Quality value
1611 6. nucleotide with the highest probability(first nucleotide)
1612 7. Quality value of the nucleotide with the highest probability
1613 8. Number of supported reads that can only be aligned to this locus
1614 9. Number of all supported reads that can be aligned to this locus
1615 10. Nucleotide with higher probability
1616 11. Quality value of nucleotide with higher probability
1617 12. Number of supported reads that can only be aligned to this locus
1618 13. Number of all supported reads that can be aligned to this locus
1619 14. Total number of reads that can be aligned to this locus
1620 15. Order and quality value
1621 16. Estimated copy number for this locus
1622 17. Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence
1623 18. The distance between this locus and another closest SNP
1624
1625 =item * B<SOAPindel format>
1626
1627 The current version of ANNOVAR handles SoapSNP and SoapIndel automatically via a
1628 single argument '--format soap'. An example of SOAP indel caller format is shown
1629 below:
1630
1631 chr11 44061282 - +2 CT Hete
1632 chr11 45901572 + +1 C Hete
1633 chr11 48242562 * -3 TTC Homo
1634 chr11 57228723 * +4 CTTT Homo
1635 chr11 57228734 * +4 CTTT Homo
1636 chr11 57555685 * -1 C Hete
1637 chr11 61482191 - +3 TCC Hete
1638 chr11 64608031 * -1 T Homo
1639 chr11 64654936 * +1 C Homo
1640 chr11 71188303 + -1 T Hete
1641 chr11 75741034 + +1 T Hete
1642 chr11 76632438 * +1 A Hete
1643 chr11 89578266 * -2 AG Homo
1644 chr11 104383261 * +1 T Hete
1645 chr11 124125940 + +4 CCCC Hete
1646 chr12 7760052 * +1 T Homo
1647 chr12 8266049 * +3 ACG Homo
1648
1649 I do not see a documentation describing this format yet as of September 2010.
1650
1651 =item B<--SOAPsv format>
1652
1653 An example is given below:
1654
1655 Chr2 Deletion 42894 43832 43167 43555 388 0-0-0 FR 41
1656
1657 An explanation of the structural variation format is given below:
1658
1659 Format description (from left to right)
1660 1. Chromosome name
1661 2. Type of structure variation
1662 3. Minimal value of start position in cluster
1663 4. Maximal value of end position in cluster
1664 5. Estimated start position of this structure variation
1665 6. Estimated end position of this structure variation
1666 7. Length of SV
1667 8. Breakpoint of SV (only for insertion)
1668 9. Unusual matching mode (F refers to align with forward sequence, R refers
1669 to align with reverse
1670 sequence)
1671 10. number of paired-end read which support this structure variation
1672
1673 =item * B<MAQ format>
1674
1675 MAQ can perform alignment and generate genotype calls, including SNP calls and
1676 indel calls. The format is described below:
1677
1678 For indel header: The output is TAB delimited with each line consisting of chromosome, start
1679 position, type of the indel, number of reads across the indel, size of the indel
1680 and inserted/deleted nucleotides (separated by colon), number of indels on the
1681 reverse strand, number of indels on the forward strand, 5' sequence ahead of the
1682 indel, 3' sequence following the indel, number of reads aligned without indels
1683 and three additional columns for filters.
1684
1685 An example is below:
1686
1687 chr10 110583 - 2 -2:AG 0 1 GCGAGACTCAGTATCAAAAAAAAAAAAAAAAA AGAAAGAAAGAAAAAGAAAAAAATAGAAAGAA 1 @2, @72, @0,
1688 chr10 120134 - 8 -2:CA 0 1 CTCTTGCCCGCTCACACATGTACACACACGCG CACACACACACACACACATCAGCTACCTACCT 7 @65,62,61,61,45,22,7, @9,12,13,13,29,52,67, @0,0,0,0,0,0,0,
1689 chr10 129630 - 1 -1:T 1 0 ATGTTGTGACTCTTAATGGATAAGTTCAGTCA TTTTTTTTTAGCTTTTAACCGGACAAAAAAAG 0 @ @ @
1690 chr10 150209 - 1 4:TTCC 1 0 GCATATAGGGATGGGCACTTTACCTTTCTTTT TTCCTTCCTTCCTTCCTTCCCTTTCCTTTCCT 0 @ @ @
1691 chr10 150244 - 2 -4:TTCT 0 1 CTTCCTTCCTTCCTTCCCTTTCCTTTCCTTTC TTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTT 0 @ @ @
1692 chr10 159622 - 1 3:AGG 0 1 GAAGGAGGAAGGACGGAAGGAGGAAGGAAGGA AGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA 0 @ @ @
1693 chr10 206372 - 2 2:GT 1 0 ATAATAGTAACTGTGTATTTGATTATGTGTGC GTGTGTGTGTGTGTGTGTGTGTGTGCGTGCTT 1 @37, @37, @8,
1694 chr10 245751 - 11 -1:C 0 1 CTCATAAATACAAGTCATAATGAAAGAAATTA CCACCATTTTCTTATTTTCATTCATTTTTAGT 10 @69,64,53,41,30,25,22,14,5,4, @5,10,21,33,44,49,52,60,69,70, @0,0,0,0,0,0,0,0,0,0,
1695 chr10 253066 - 1 2:TT 0 1 TATTGATGAGGGTGGATTATACTTTAGAACAC TATTCAAACAGTTCTTCCACATATCTCCCTTT 0 @ @ @
1696 chr10 253455 - 2 -3:AAA 1 0 GTTGCACTCCAGCCTGGCGAGATTCTGTCTCC AAAAAAAAAAAAAAAAATTGTTGTGAAATACA 1 @55, @19, @4,
1697
1698 For snp output file: Each line consists of chromosome, position, reference base,
1699 consensus base, Phred-like consensus quality, read depth, the average number of
1700 hits of reads covering this position, the highest mapping quality of the reads
1701 covering the position, the minimum consensus quality in the 3bp flanking regions
1702 at each side of the site (6bp in total), the second best call, log likelihood
1703 ratio of the second best and the third best call, and the third best call.
1704
1705 An example is below:
1706
1707 chr10 83603 C T 28 12 2.81 63 34 Y 26 C
1708 chr10 83945 G R 59 61 4.75 63 62 A 47 G
1709 chr10 83978 G R 47 40 3.31 63 62 A 21 G
1710 chr10 84026 G R 89 22 2.44 63 62 G 49 A
1711 chr10 84545 C T 54 9 1.69 63 30 N 135 N
1712 chr10 85074 G A 42 5 1.19 63 38 N 108 N
1713 chr10 85226 A T 42 5 1.00 63 42 N 107 N
1714 chr10 85229 C T 42 5 1.00 63 42 N 112 N
1715 chr10 87518 A G 39 4 3.25 63 38 N 9 N
1716 chr10 116402 T C 39 4 1.00 63 38 N 76 N
1717
1718
1719 =item * B<CASAVA format>
1720
1721 An example of Illumina CASAVA format is given below:
1722
1723 #position A C G T modified_call total used score reference type
1724 14930 3 0 8 0 GA 11 11 29.10:11.10 A SNP_het2
1725 14933 4 0 7 0 GA 11 11 23.50:13.70 G SNP_het1
1726 14976 3 0 8 0 GA 11 11 24.09:9.10 G SNP_het1
1727 15118 2 1 4 0 GA 8 7 10.84:6.30 A SNP_het2
1728
1729 An example of the indels is given below:
1730
1731 # ** CASAVA depth-filtered indel calls **
1732 #$ CMDLINE /illumina/pipeline/install/CASAVA_v1.7.0/libexec/CASAVA-1.7.0/filterIndelCalls.pl--meanReadDepth=2.60395068970547 --indelsCovCutoff=-1 --chrom=chr1.fa /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0000.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0001.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0002.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0003.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0004.txt
1733 #$ CHROMOSOME chr1.fa
1734 #$ MAX_DEPTH undefined
1735 #
1736 #$ COLUMNS pos CIGAR ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) max2_gtype bp1_reads ref_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count
1737 948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2
1738 978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0
1739 1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4
1740 1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0
1741
1742 =item * B<VCF4 format>
1743
1744 VCF4 can be used to describe both population-level variation information, or for
1745 reads derived from a single individual.
1746
1747 One example of the indel format for one individual is given below:
1748
1749 ##fileformat=VCFv4.0
1750 ##IGv2_bam_file_used=MIAPACA2.alnReAln.bam
1751 ##INFO=<ID=AC,Number=2,Type=Integer,Description="# of reads supporting consensus indel/any indel at the site">
1752 ##INFO=<ID=DP,Number=1,Type=Integer,Description="total coverage at the site">
1753 ##INFO=<ID=MM,Number=2,Type=Float,Description="average # of mismatches per consensus indel-supporting read/per reference-supporting read">
1754 ##INFO=<ID=MQ,Number=2,Type=Float,Description="average mapping quality of consensus indel-supporting reads/reference-supporting reads">
1755 ##INFO=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads">
1756 ##INFO=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads">
1757 ##INFO=<ID=SC,Number=4,Type=Integer,Description="strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads">
1758 ##IndelGenotyperV2=""
1759 ##reference=hg18.fa
1760 ##source=IndelGenotyperV2
1761 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Miapaca_trimmed_sorted.bam
1762 chr1 439 . AC A . PASS AC=5,5;DP=7;MM=7.0,3.0;MQ=23.4,1.0;NQSBQ=23.98,25.5;NQSMM=0.04,0.0;SC=2,3,0,2 GT 1/0
1763 chr1 714048 . T TCAAC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.266666,21.932203;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1
1764 chr1 714049 . G GC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.233334,21.83051;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1
1765 chr1 813675 . A AATAG . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=25.74,25.166666;NQSMM=0.0,0.033333335;SC=4,1,1,2 GT 0/1
1766 chr1 813687 . AGAGAGAGAGAAG A . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=24.54,25.2;NQSMM=0.02,0.06666667;SC=4,1,1,2 GT 1/0
1767
1768
1769 =back
1770
1771 The code was written by Dr. Kai Wang and modified by Dr. Germán Gastón Leparc.
1772 Various users have provided sample input files for many SNP callin software, for
1773 the development of conversion subroutines. We thank these users for their
1774 continued support to improve the functionality of the script.
1775
1776 For questions or comments, please contact kai@openbioinformatics.org.
1777
1778 =cut