annotate gfapts/inc/annovar/convert2annovar.pl @ 1:028f435b6cfb draft default tip

Uploaded
author rdaveau
date Fri, 03 Aug 2012 05:50:41 -0400
parents f753b30013e6
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1 #!/usr/bin/perl
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
2 use warnings;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
3 use strict;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
4 use Getopt::Long;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
5 use Pod::Usage;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
6
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
7 our $VERSION = '$Revision: 466 $';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
8 our $LAST_CHANGED_DATE = '$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
9
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
10 our ($verbose, $help, $man);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
11 our ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
12 our ($outfile, $format, $includeinfo, $snpqual, $snppvalue, $coverage, $maxcoverage, $chr, $chrmt, $altcov, $fraction, $species, $filterword, $confraction, $allallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
13
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
14 our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-');
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
15
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
16
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
17 GetOptions('verbose'=>\$verbose, 'help|h'=>\$help, 'man'=>\$man, 'outfile=s'=>\$outfile, 'format=s'=>\$format, 'includeinfo'=>\$includeinfo,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
18 'snpqual=f'=>\$snpqual, 'snppvalue=f'=>\$snppvalue, 'coverage=i'=>\$coverage, 'maxcoverage=i'=>\$maxcoverage, 'chr=s'=>\$chr, 'chrmt=s'=>\$chrmt,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
19 'fraction=f'=>\$fraction, 'altcov=i'=>\$altcov,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
20 'species'=>\$species, 'filter=s'=>\$filterword, 'confraction=f'=>\$confraction, 'allallele!'=>\$allallele) or pod2usage ();
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
21
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
22 $help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
23 $man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
24 @ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
25 @ARGV == 1 or pod2usage ("Syntax error");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
26
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
27 ($variantfile) = @ARGV;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
28
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
29 $chrmt ||= 'M';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
30
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
31 if (not $format) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
32 $format = 'pileup';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
33 print STDERR "NOTICE: the default --format argument is set as 'pileup'\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
34 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
35
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
36 if (defined $outfile) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
37 open (STDOUT, ">$outfile") or die "Error: cannot write to output file $outfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
38 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
39
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
40 defined $snpqual and $format eq 'pileup' || $format eq 'vcf4' || pod2usage ("Error in argument: the --snpqual is supported only for the 'pileup' or 'vcf4' format");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
41 defined $snppvalue and $format eq 'gff3-solid' || pod2usage ("Error in argument: the --snppvalue is supported only for the 'gff3-solid' format");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
42 if (not defined $snpqual and $format eq 'pileup') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
43 $snpqual = 20;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
44 print STDERR "NOTICE: the default --snpqual argument for pileup format is set as 20\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
45 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
46
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
47 if (not defined $snppvalue) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
48 $snppvalue = 1; #by default, do not use any of the P-value cutoff in filtering out GFF3-SOLID files (this is differnt from handling pileup files)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
49 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
50
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
51 if (not defined $coverage) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
52 $coverage = 0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
53 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
54
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
55 if (defined $fraction) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
56 $format eq 'pileup' or $format eq 'vcf4' or pod2usage ("Error in argument: the '--fraction' argument is supported for the pileup or vcf4 format only");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
57 $format eq 'vcf4' and print STDERR "NOTICE: the --fraction argument works ONLY on indels for vcf4 format\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
58 $fraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --fraction argument must be between 0 and 1 inclusive");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
59 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
60 $fraction = 0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
61 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
62
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
63 if (defined $confraction) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
64 $format eq 'vcf4' and print STDERR "NOTICE: the --confraction argument works ONLY on indels for vcf4 format\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
65 $confraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --confraction argument must be between 0 and 1 inclusive");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
66 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
67 $confraction = 0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
68 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
69
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
70 if (defined $altcov) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
71 $format eq 'pileup' or pod2usage ("Error in argument: the '--altcov' argument is supported for the '--format pileup' only");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
72 $altcov < $coverage or pod2suage ("Error in argument: the --altcov argument must be less than --coverage");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
73 $altcov > 0 or pod2suage ("Error in argument: the --altcov argument must be a positive integer");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
74 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
75
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
76 if (defined $species) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
77 $format eq 'gff3-solid' or pod2usage ("Error in argument: the '--species' argument is only necessary for the '--format gff3-solid'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
78 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
79
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
80 if ($allallele) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
81 $format eq 'vcf4' or pod2usage ("Error in argument: the '--allallele' argument is only supported for the '--format vcf4'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
82 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
83
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
84 if ($format eq 'pileup') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
85 convertPileup ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
86 } elsif ($format eq 'cg') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
87 convertCG ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
88 } elsif ($format eq 'gff3-solid') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
89 convertGFF3SolidSNP ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
90 } elsif ($format eq 'soap') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
91 print STDERR "WARNING: the support for '--format soap' is not well developed yet and may contain bugs for indel analysis.\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
92 convertSOAP ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
93 } elsif ($format eq 'maq') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
94 print STDERR "WARNING: the support for '--format maq' is not well developed yet and may contain bugs.\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
95 convertMAQSNP ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
96 } elsif ($format eq 'casava') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
97 if (not defined $chr) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
98 pod2usage ("Error in argument: please supply --chr argument for the '--format casava'");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
99 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
100 convertCASAVA ($variantfile, $chr);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
101 } elsif ($format eq 'vcf4') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
102 convertVCF4 ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
103 } elsif ($format eq 'annovar') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
104 convertANNOVAR ($variantfile);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
105 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
106 pod2usage ("Error in argument: the --format $format is not currently supported. Please contact ANNOVAR developer for adding the support");
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
107 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
108
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
109
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
110 sub convertPileup {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
111 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
112 my ($countline, $countvar, $counthom, $counthet, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
113
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
114 if ($variantfile eq 'stdin') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
115 *VAR = *STDIN;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
116 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
117 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
118 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
119 print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, SNP quality, total reads, reads with mutation\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
120
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
121 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
122 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
123 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
124 my $hom = 'hom';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
125 my @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
126 @field >= 10 or die "Error: invalid record found in pileupfile $variantfile (at least 10 fields expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
127 my ($chr, $pos, $wt, $call, @other) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
128 my ($cons_qual, $snp_quality, $readcount, $readallele) = @field[4,5,7,8];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
129 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
130 $wt = uc $wt; #wt may or may not in upper case, it depends on the input FASTA file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
131 $call = uc $call; #usually call is in upper case
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
132 $readallele = uc $readallele; #lower case indicate the opposite strand
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
133
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
134 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
135
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
136 $snp_quality >= $snpqual or next; #quality of the variant call
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
137 $readcount >= $coverage or next; #read count of the variant call
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
138 $maxcoverage and $readcount <= $maxcoverage || next; #maximum coverage of the variant call
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
139
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
140 if ($wt eq '*') { #indel
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
141 #example:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
142 #1 970271 * +C/+C 39 106 44 5 +C * 1 4 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
143 #1 1548977 * */+CCG 29 29 42 3 * +CCG 2 1 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
144 #1 1674810 * */+C 24 24 42 6 * +C 5 1 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
145 #1 968466 * -CT/-CT 53 339 55 5 -CT * 5 0 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
146 #1 1093600 * -GAAA/* 29 29 53 3 -GAAA * 1 2 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
147 #1 1110101 * */-A 41 41 17 6 * -A 5 1 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
148 #1 1215395 * */-TC 26 26 32 4 * -TC 3 1 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
149 my @obs = split (/\//, $call); #example: '+AG/+AG' as homozygotes, '*/-TA' or '*/+T' as heterozygotes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
150 @obs == 2 or die "Error: pileup record contains more than two alternative alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
151 my ($end, $ref, $alt);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
152 my ($indelreadcount); #number of reads supporting the indel
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
153
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
154
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
155 if ($obs[0] eq $obs[1]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
156 #something weird may occur in SamTools output: 22 15231121 * */* 360 0 32 158 * +GA 156 2 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
157 $obs[0] eq '*' and next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
158
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
159 #for deletions, SAMTOOLS represent deletion using a location before the first deleted base in the reference sequence coordinate system
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
160 #for example, a deletion in Samtools output is "1 109266688 * */-CTT 1429 1429 58 43 * -CTT 24 19 0 0 0"
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
161 #the correct ANNOVAR input (for rs35029887) should be "1 109266689 109266691 CTT - het 1429"
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
162 #insertions are fine without change; for example, check rs5745466 in Genome Browser; samtools report "1 76119508 * +AT/+AT"
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
163 #for this insertion, ANNOVAR input file (for rs5745466) becomes "1 76119508 76119508 - AT hom 1601"
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
164
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
165 if ($obs[0] =~ m/^\-/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
166 $pos++; #add 1 to position in deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
167 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
168
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
169 $indelreadcount = calculateIndelReadCount ($obs[0], \@field);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
170 $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
171 defined $altcov and $indelreadcount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
172
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
173 ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
174 print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
175 $counthom++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
176 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
177 $hom = 'het';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
178 if ($obs[0] =~ m/^[\-\+]/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
179 $obs[0] =~ m/^\-/ and $pos++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
180 ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
181 $indelreadcount = calculateIndelReadCount ($obs[0], \@field);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
182 $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
183 defined $altcov and $indelreadcount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
184
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
185 print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
186 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
187 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
188 if ($obs[1] =~ m/^[\-\+]/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
189 $obs[1] =~ m/^\-/ and $pos++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
190 ($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[1]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
191 $indelreadcount = calculateIndelReadCount ($obs[1], \@field);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
192 $indelreadcount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
193 defined $altcov and $indelreadcount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
194
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
195 print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
196 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
197 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
198 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
199 $countindel++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
200 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
201 #1 798494 G A 36 36 58 3 AAA bbb
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
202 #1 798677 T K 33 33 52 26 ,$.,,G.GG,.,......,..G,,... b`bbBaJIbFbZWaTNQbb_VZcbbb
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
203 #1 856182 G A 42 42 50 5 AAAAA B\bbb
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
204 #1 861034 A M 48 48 49 14 ,$,.,..,cc.c.,c bbBbb`]BFbHbBB
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
205 #1 864289 T K 22 22 56 6 .g,,g, BH^_BB
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
206
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
207 $wt eq $call and next; #this is not a SNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
208 my $obs = $iupac{$call} or die "Error: invalid best call ($call) in <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
209 my @obs = split (//, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
210 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
211 if ($obs[0] ne $obs[1]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
212 $hom = 'het';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
213 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
214
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
215
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
216 if ($obs[0] eq $wt) { #obs[0] is guaranteed to be an alternative allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
217 @obs = @obs[1,0];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
218 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
219 if ($wt eq 'A' and $obs[0] eq 'G' or $wt eq 'G' and $obs[0] eq 'A' or $wt eq 'C' and $obs[0] eq 'T' or $wt eq 'T' and $obs[0] eq 'C') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
220 unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
221 $countti++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
222 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
223
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
224 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
225 unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
226 $counttv++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
227 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
228 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
229
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
230 my $mutallelecount;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
231
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
232 if ($obs[1] eq $wt) { #het SNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
233 if ($chr eq $chrmt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
234 $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
235 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
236 $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
237 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
238 defined $altcov and $mutallelecount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
239
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
240 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
241 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
242 } elsif ($obs[1] ne $obs[0]) { #het SNP but both differ from reference allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
243 if ($chr eq $chrmt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
244 $hom = calculateAllelicFraction ($obs[1], $field[8], $readcount);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
245 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
246 $mutallelecount = calculateMutAlleleCount ($obs[1], $readallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
247 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
248 defined $altcov and $mutallelecount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
249
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
250 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[1], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
251 if ($chr eq $chrmt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
252 $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
253 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
254 $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
255 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
256 defined $altcov and $mutallelecount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
257
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
258 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
259 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
260 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
261 } else { #homo SNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
262 if ($chr eq $chrmt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
263 $hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
264 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
265 $mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
266 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
267 defined $altcov and $mutallelecount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
268
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
269 print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
270 $counthom++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
271 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
272 $countsnp++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
273 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
274 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
275 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
276 my $triallelic = $countsnp-$countti-$counttv;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
277 print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
278 print STDERR "NOTICE: Among ${\($counthet+$counthom)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
279 print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
280 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
281
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
282 sub calculateIndelReadCount {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
283 my ($obs, $field) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
284 #make sure to use upper case in the comparison, for example:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
285 #chr10 130133 * */-ca 189 189 59 31 * -ca 27 4 0 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
286 if ($obs eq uc $field->[8]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
287 return $field->[10];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
288 } elsif ($obs eq uc $field->[9]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
289 return $field->[11];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
290 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
291 die "Error: invalid record in pileup file (indel counts cannot be inferred): <$obs> vs <@$field>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
292 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
293 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
294
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
295 sub calculateMutAlleleCount {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
296 my ($allele, $string) = @_; #they should be already both in upper case
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
297 $string =~ s/\^.//g; #^ is followed by mapping quality
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
298 $string =~ s/\$//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
299 $string =~ s/[+-]1[^\d]//g; #1 followed by a non-number
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
300 $string =~ s/[+-]2..//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
301 $string =~ s/[+-]3...//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
302 $string =~ s/[+-]4....//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
303 $string =~ s/[+-]5.....//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
304 $string =~ s/[+-]6......//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
305 $string =~ s/[+-]7.......//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
306 $string =~ s/[+-]8........//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
307 $string =~ s/[+-]9.........//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
308 $string =~ s/[+-]10..........//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
309
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
310 #make sure to use upper case letter
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
311 my @string = split (//, uc $string);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
312 my $count = 0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
313 for my $i (0 .. @string-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
314 $allele eq $string[$i] and $count++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
315 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
316 return $count;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
317 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
318
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
319 sub calculateAllelicFraction {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
320 my ($obs, $readbase, $readcount) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
321 my @readbase = split (//, $readbase);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
322 my $count=0;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
323 for my $i (0 .. @readbase-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
324 uc $obs eq uc $readbase[$i] and $count++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
325 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
326 my $hom = $count/$readcount;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
327 length ($hom) > 5 and $hom > 0.001 and $hom = sprintf ("%.3f", $hom);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
328 return $hom;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
329 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
330
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
331 sub recalculateEndRefObs { #recalculate end position, reference allele and observed allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
332 my ($end, $ref, $obs) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
333 if ($obs =~ m/^\-(\w+)/) { #deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
334 $end += (length ($1)-1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
335 $ref = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
336 $obs = '-';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
337 } elsif ($obs =~ m/^\+(\w+)/) { #insertion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
338 $ref = '-';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
339 $obs = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
340 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
341 die "Error: cannot handle $end, $ref, $obs\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
342 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
343 return ($end, $ref, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
344 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
345
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
346 sub convertCG {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
347 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
348
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
349 my ($foundheader, $countline, @field);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
350 my ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
351 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
352 print STDERR "NOTICE: Converting variants from $variantfile\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
353 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
354 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
355 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
356 if (m/^>locus/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
357 $foundheader++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
358 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
359 if (not $foundheader) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
360 $countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 lines)\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
361 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
362 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
363 my ($locus, $ploidy, $haplo, $chr, $start, $end, $vartype, $ref, $obs, $score, $haplolink, $xref) = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
364 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
365 $vartype eq 'ins' or $start++; #CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476 2 2 chr1 751820 751820 ins T 49 dbsnp:rs59038458")
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
366 $obs eq '' and $obs = '-';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
367 $ref eq '' and $ref = '-';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
368
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
369 if ($vartype =~ m/^snp|ins|del|delins|sub$/) { #in new versions of the files, "sub" is used instead of "delins".
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
370 #$chr eq 'M' and next; #ignore chrM markers as they are not diploid
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
371 if ($chr eq $prechr and $start eq $prestart and $end eq $preend and $obs eq $preobs) { #homozygous mutation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
372 print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs, "\t", $vartype, "\t", ($score+$prescore)/2, "\t", "hom\t", $xref, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
373 ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
374 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
375 if ($prestart and $preend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
376 print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
377 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
378 ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = ($chr, $start, $end, $vartype, $ref, $obs, $score, $xref);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
379 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
380 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
381 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
382 if ($prestart and $preend) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
383 print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
384 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
385 print STDERR "NOTICE: Done with $countline lines\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
386 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
387
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
388 sub convertGFF3SolidSNP {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
389 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
390 my ($countline, $countvar, $countallvar, @other) = (0, 0, 0);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
391 my ($unknown_count); #count of variants with 'unknown' variation type
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
392
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
393 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
394 $_ = <VAR>;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
395 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
396 m/^##gff-version\s+3/ or die "Error: invalid first line in GFF3 file ('##gff-version 3' expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
397 $_ = <VAR>;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
398 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
399 m/^##solid-gff-version/ or print STDERR "WARNING: problematic second line in GFF3-SOLiD file ('##solid-gff-version' expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
400
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
401 print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, variant score (P-value), total clipped normal coverage reads, total reads with mutated allele\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
402
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
403 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
404 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
405 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
406 m/^##/ and next; #header of comment lines
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
407 m/^#/ and next; #header of results lines
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
408
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
409 my @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
410 @field == 9 or die "Error: invalid record found in $variantfile (10 fields expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
411 my ($chr, $program, $type, $pos, $end, $score, $attribute) = @field[0,1,2,3,4,5,8]; #score is the P-value for the SNP calls
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
412 $chr eq 'chr_name' and next; #header line
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
413
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
414 if ($score ne '.') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
415 $score >=0 and $score <=1 or die "Error: invalid score record found in file (0-1 range expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
416 $score <= $snppvalue or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
417 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
418
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
419 if ($species and $species eq 'human') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
420 $chr eq '23' and $chr = 'X';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
421 $chr eq '24' and $chr = 'Y';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
422 $chr eq '25' and $chr = 'M';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
423 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
424
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
425 $includeinfo and @other = ($attribute); #unless -includeinfo is set, the other will not be printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
426
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
427 my ($readcount, $mutallelecount) = ('.', '.'); #total coverage, coverage for mutated alleles
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
428
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
429 if ($type eq 'unknown') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
430 #SOLiD GDD3 may have unknown variation types
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
431 #chr1 AB_SOLiD Small Indel Tool unknown 3833062 3833153 1 . . ID=5483;len=no_call;allele-call-pos=3833062;allele-call=/CCAC;allele-pos=3833057;alleles=atccatccacccatc/aTCCATCCACCCACCCATC/NO_CALL;allele-counts=REF,2,2;tight_chrom_pos=none;loose_chrom_pos=3833058-3833069;no_nonred_reads=3;coverage_ratio=8.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_50_10_r,L1_1_50_15_r,L1_1_50_15_r,L1_1_50_12_r;bead_ids=1018_196_970,699_1263_465,220_513_923,2022_1532_1071;overall_qvs=4,6,2,50;no_mismatches=5,4,2,0;read_pos=27,29,31,13;from_end_pos=23,21,19,37;strands=+,+,+,+;tags=R3,F3,F3,F3;indel_sizes=-92,-112,4,4;non_indel_no_mismatches=0,0,8,0;unmatched-lengths=50,50,50,50;ave-unmatched=50.0000;anchor-match-lengths=48,49,49,49;ave-anchor-length=48.7500;read_seqs=G23223321322112233223100132013201320110011001322332,T33223321322112233223100132013201320110013021322332,T33223321322112233223100132013201320110011001322332,T31001320132013201100110013223322113030332233113032;base_qvs=;non_indel_seqs=T21322332211221121322332230321212121223322332233221,G12020202202020012001200213022002130012332310122030,G12020202202020012001000210022012110312331331122030,G22111012101031010100002002321020002202121121313021;non_indel_qvs=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
432 $unknown_count++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
433 next; #do not count this one!
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
434 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
435
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
436 if ($program eq 'SOLiD_diBayes' or $program eq 'AB_SOLiD SNP caller') { #SNP variants
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
437 #detailed description can be found at http://solidsoftwaretools.com/gf/download/docmanfileversion/208/866/DiBayes_Documentation_v1.2.pdf
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
438 #chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
439 #chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
440 #chr1 SOLiD_diBayes SNP 714835 714835 0.041579 . . genotype=R;reference=A;coverage=5;refAlleleCounts=3;refAlleleStarts=3;refAlleleMeanQV=18;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=20;diColor1=02;diColor2=20;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
441
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
442 $pos == $end or die "Error: invalid record found in GFF3-SOLiD file: start and end discordant: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
443
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
444 my ($wt, $call);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
445
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
446 if ($attribute =~ m/ref_base=(\w)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
447 $wt = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
448 } elsif ($attribute =~ m/reference=(\w)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
449 $wt = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
450 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
451 die "Error: invalid record found in GFF3-SOLiD file (ref_base/reference was not found): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
452 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
453
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
454 if ($attribute =~ m/consen_base=(\w)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
455 $call = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
456 } elsif ($attribute =~ m/genotype=(\w)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
457 $call = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
458 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
459 die "Error: invalid record found in GFF3-SOLiD file (consen_base was not found): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
460 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
461
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
462 if ($attribute =~ m/coverage=(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
463 $readcount = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
464 $readcount >= $coverage or next; #read count of the variant call
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
465 $maxcoverage and $readcount <= $maxcoverage || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
466 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
467 if ($attribute =~ m/novelAlleleCounts=(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
468 $mutallelecount = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
469 $mutallelecount/$readcount >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
470 defined $altcov and $mutallelecount >= $altcov || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
471 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
472
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
473 my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
474 my @obs = split (//, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
475 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
476 if ($obs[0] eq $wt and $obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
477 die "Error: reference alleles are identical to observed alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
478 } elsif ($obs[0] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
479 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
480 } elsif ($obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
481 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
482 } elsif ($obs[1] ne $obs[0]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
483 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
484 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
485 $countallvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
486 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
487 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
488 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
489 } elsif ($program eq 'AB_CNV_PIPELINE') { #CNV
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
490 if ($attribute =~ m/copynum=(\d+)/ or $attribute =~ m/copynumber=(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
491 if ($1 < 2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
492 print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
493 } elsif ($1 > 2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
494 print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
495 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
496 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
497 print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
498 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
499 } elsif ($program eq 'AB_SOLiD Large Indel Tool') { #CNV
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
500 #http://solidsoftwaretools.com/gf/download/docmanfileversion/182/780/Large_Indel_Documentation_v1.0.0.pdf
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
501 ## [FIELDS] (1) chromosome (2) version (3) indel type (4) breakpoint start (5) breakpoint end (6) p-value (7) NA (8) NA (9) attributes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
502 #chr10 AB_SOLiD Large Indel Tool insertion 151910 151969 2.77548e-11 . . dev=-71;avgDev=-1.63884;zygosity=HOMOZYGOUS;nRef=0;nDev=14;refDev=0;devDev=-1.60924;refVar=0;devVar=0.0159438;beadIds=1750_720_1641,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
503 #chr10 AB_SOLiD Large Indel Tool insertion 154109 154729 2.1559e-11 . . dev=-66;avgDev=-1.51253;zygosity=HOMOZYGOUS;nRef=0;nDev=15;refDev=0;devDev=-1.02864;refVar=0;devVar=0.133236;beadIds=1728_1671_1739,1250_231_25,811_783_1090,1035_908_491,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
504 my ($call, @call, $zygosity);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
505 if ($attribute =~ m#zygosity=HEMIZYGOUS#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
506 $zygosity = 'het';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
507 } elsif ($attribute =~ m#zygosity=HOMOZYGOUS#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
508 $zygosity = 'hom';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
509 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
510 $zygosity = 'unk';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
511 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
512 if ($type eq 'insertion') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
513 #the true boundary is unknown (start is different from end) so we cannot use "-" to represent reference allele.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
514 print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", 0, "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
515 } elsif ($type eq 'deletion') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
516 print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
517 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
518 } elsif ($program eq 'AB_SOLiD Small Indel Tool') { #small indels
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
519 #typical simple insertion and deletions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
520 #chr1 AB_SOLiD Small Indel Tool deletion 1352612 1352614 1 . . ID=1290;del_len=3;allele-call-pos=1352612;allele-call=cct/;allele-pos=1352610;alleles=cccctccat/cCCCAT;allele-counts=REF,2;tight_chrom_pos=1352612-1352614;loose_chrom_pos=1352612-1352614;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_3_r,L1_1_25_8_r;bead_ids=1470_2000_506,822_1710_1767;overall_qvs=18,19;no_mismatches=3,3;read_pos=6,13;from_end_pos=19,12;strands=-,+;tags=R3,R3;indel_sizes=-3,-3;non_indel_no_mismatches=1,-1;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=24,99;ave-anchor-length=61.5000;read_seqs=G0112310001100003120031200,G0300213000011000132110021;base_qvs=;non_indel_seqs=T2120033002022200220000002,;non_indel_qvs=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
521 #chr1 AB_SOLiD Small Indel Tool insertion_site 1311162 1311162 1 . . ID=1249;ins_len=1;allele-call-pos=1311162;allele-call=/G;allele-pos=1311161;alleles=gaggggggg/GAGGGGGGGG/NO_CALL;allele-counts=REF,3,1;tight_chrom_pos=none;loose_chrom_pos=1311160-1311169;no_nonred_reads=3;coverage_ratio=4.6667;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_50_10_r,L1_1_25_2_r,L1_1_25_3_r;bead_ids=850_837_429,1160_878_181,404_1050_1881,1084_64_1343;overall_qvs=20,56,25,25;no_mismatches=3,2,2,1;read_pos=11,22,11,11;from_end_pos=14,28,14,14;strands=+,-,-,-;tags=R3,F3,F3,F3;indel_sizes=1,1,1,1;non_indel_no_mismatches=-1,1,0,1;unmatched-lengths=25,50,25,25;ave-unmatched=31.2500;anchor-match-lengths=99,49,24,24;ave-anchor-length=49.0000;read_seqs=G1020001130221020000000020,T03223323210110021000000022122030100020221222222122,T0102210000000221223301000,T0102210000000221220301000;base_qvs=;non_indel_seqs=,G21202030032202013220021321131212021000122300013132,G1331133120001221220120120,G1331133120001221220120220;non_indel_qvs=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
522
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
523 #sometimes, allele-call is ambiguous that requires a "block substitution" representation (although they were annotated as insertion or deletion by SOLiD, they should be treated as block substitution by ANNOVAR)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
524 #sometimes, mutiple allele calls may be present at the same site
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
525 #chr1 AB_SOLiD Small Indel Tool deletion 1209357 1209360 1 . . ID=1101;del_len=4;allele-call-pos=1209357;allele-call=ggtggg/TT;allele-pos=1209355;alleles=ggggtgggggggtt/gGTTGGGGTT/gGTGTTTTGCCTT/NO_CALL;allele-counts=REF,3,1,1;tight_chrom_pos=none;loose_chrom_pos=1209357-1209363;no_nonred_reads=4;coverage_ratio=3.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=0.9888;run_names=L1_1_25_1_r,L1_1_25_2_r,L1_1_25_4_r,L1_1_25_3_r,L1_1_25_7_r;bead_ids=1017_1024_53,1493_1896_615,1794_647_1473,307_1116_687,773_1492_1671;overall_qvs=24,24,28,24,8;no_mismatches=2,3,2,3,2;read_pos=14,9,14,9,15;from_end_pos=11,16,11,16,10;strands=-,+,-,+,+;tags=F3,R3,F3,F3,F3;indel_sizes=-4,-4,-4,-4,3;non_indel_no_mismatches=1,0,0,0,0;unmatched-lengths=25,25,25,25,25;ave-unmatched=25.0000;anchor-match-lengths=24,24,24,24,24;ave-anchor-length=24.0000;read_seqs=T2221100101000101000221100,G0001122000100000101001020,T2221100101000101000221100,T1112200010100010100112000,T1011220000111000130200001;base_qvs=;non_indel_seqs=G0312033221312111022200300,T0111113210210112100001130,G0312133221312111022200300,G0231003132222112000012020,G3121331033101113122312020;non_indel_qvs=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
526 #chr1 AB_SOLiD Small Indel Tool deletion 1209436 1209436 1 . . ID=1103;del_len=1;allele-call-pos=1209436;allele-call=ag/A/G;allele-pos=1209434;alleles=tgagggggtt/tGAGGGGTT/tGGGGGGTT;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1209436-1209441;no_nonred_reads=2;coverage_ratio=5.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_25_2_r;bead_ids=1315_1584_2005,1706_194_437;overall_qvs=28,21;no_mismatches=0,3;read_pos=9,7;from_end_pos=16,18;strands=-,-;tags=R3,R3;indel_sizes=-1,-1;non_indel_no_mismatches=-1,0;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=99,24;ave-anchor-length=61.5000;read_seqs=G3001010000011001010000001,G3010100022110010111000110;base_qvs=;non_indel_seqs=,T1112003220020013202122300;non_indel_qvs=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
527 #chr1 AB_SOLiD Small Indel Tool insertion_site 1424540 1424540 1 . . ID=1376;ins_len=3;allele-call-pos=1424540;allele-call=tt/CCCAC;allele-pos=1424537;alleles=ttttttg/TTTCCCACTG/NO_CALL;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1424536-1424543;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_7_r,L1_1_50_16_r;bead_ids=703_437_370,1089_878_1744;overall_qvs=1,9;no_mismatches=3,4;read_pos=5,35;from_end_pos=20,15;strands=-,-;tags=R3,F3;indel_sizes=3,3;non_indel_no_mismatches=2,0;unmatched-lengths=25,50;ave-unmatched=37.5000;anchor-match-lengths=24,47;ave-anchor-length=35.5000;read_seqs=G2032002200200000000000020,T30100102220312202103112130230322210121100200002100;base_qvs=;non_indel_seqs=T2121120003012303000000000,G22213300221101011121030022002222300220322213303102;non_indel_qvs=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
528 my ($call, @call, $zygosity);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
529 if ($attribute =~ m#experimental-zygosity=HEMIZYGOUS#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
530 $zygosity = 'het';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
531 } elsif ($attribute =~ m#experimental-zygosity=HOMOZYGOUS#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
532 $zygosity = 'hom';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
533 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
534 $zygosity = 'unk';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
535 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
536 $score = '.'; #by default, score=1 in the output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
537
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
538 #no_nonred_reads: Number of reads with unique start positions (non-redundant reads).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
539 #coverage_ratio: Clipped normal coverage/number of non-redundant reads.Clipped coverage is where the parts of the read that are within 5 bp at either end are not counted as a part of coverage.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
540 if ($attribute =~ m/no_nonred_reads=(\d+);coverage_ratio=([\d\.]+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
541 $readcount = int ($1*$2);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
542 $readcount >= $coverage or next; #clipped normal read count of the variant call (this could even be lower than mut allele count)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
543 $maxcoverage and $readcount <= $maxcoverage || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
544 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
545 $readcount = '.';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
546 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
547 if ($attribute =~ m/allele-counts=REF,(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
548 $mutallelecount = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
549 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
550
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
551 if ($attribute =~ m#allele\-call=([\w\/]+)#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
552 @call = split (/\//, $1);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
553
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
554 if (@call == 1) { #a simple deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
555 print $chr, "\t", $pos, "\t", $end, "\t", $call[0], "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
556 } elsif ($call[0] eq '') { #a simple insertion (single or multiple allele)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
557 for my $i (1 .. @call-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
558 print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
559 $i > 1 and $countallvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
560 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
561 } else { #an indel that may have several alleles, or may require a block substitution representation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
562 for my $i (1 .. @call-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
563 print $chr, "\t", $pos, "\t", $pos+length($call[0])-1, "\t", $call[0], "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
564 $i > 1 and $countallvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
565 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
566 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
567 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
568 $call = '0';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
569 print $chr, "\t", $pos, "\t", $end, "\t", $call, "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
570 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
571 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
572 die "Error: unrecognizable genotype calling program encountered (valid types are SOLiD_diBayes, AB_CNV_PIPELINE, AB_SOLiD Large Indel Tool, AB_SOLiD Small Indel Tool): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
573 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
574
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
575 $countvar++; #variation positions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
576 $countallvar++; #all variants (maybe several at one variation position)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
577 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
578 print STDERR "NOTICE: Finished processing $variantfile with $countline input lines\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
579 print STDERR "NOTICE: Wrote variants in $countvar variation positions ($countallvar variants considering multi-allelic ones)\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
580 $unknown_count and print STDERR "WARNING: $unknown_count variants with 'unknown' variation type were skipped\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
581 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
582
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
583
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
584
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
585 sub convertSOAP {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
586 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
587 my ($countline, $countvar, @other);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
588
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
589 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
590
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
591 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
592 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
593 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
594
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
595 my @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
596 if (@field == 18) { #snp file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
597 my ($chr, $pos, $wt, $call, @other) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
598 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
599
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
600 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
601
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
602 my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
603 my @obs = split (//, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
604 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
605 if ($obs[0] eq $wt and $obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
606 die "Error: reference alleles are identical to observed alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
607 } elsif ($obs[0] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
608 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
609 } elsif ($obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
610 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
611 } elsif ($obs[1] ne $obs[0]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
612 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
613 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
614 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
615 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
616 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
617 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
618 } elsif (@field == 6) { #indel file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
619 my ($chr, $pos, $strand, $indellen, $call, $homo) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
620 $homo eq 'Homo' and $homo = 'hom';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
621 $homo eq 'Hete' and $homo = 'het';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
622 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
623
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
624 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
625
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
626 if ($indellen =~ m/^\+(\d+)$/) { #insertion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
627 length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
628 print join("\t", $chr, $pos, $pos, '-', $call, $homo), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
629 } elsif ($indellen =~ m/^\-(\d+)$/) { #deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
630 length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
631 print join("\t", $chr, $pos, $pos+$1-1, $call, '-', $homo), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
632 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
633 die "Error: invalid record found in SOAPindel file: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
634 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
635 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
636 die "Error: invalid record found in $variantfile (18 or 6 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
637 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
638 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
639 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
640 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
641 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
642
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
643 sub convertANNOVAR {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
644 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
645 my ($countline, $countvar, $countsnp);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
646 my ($countti, $counttv) = (0, 0);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
647
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
648 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
649
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
650 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
651 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
652
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
653 my @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
654 my ($chr, $start, $end, $ref, $obs) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
655 if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
656 if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' or $obs eq 'A' or $ref eq 'C' and $obs eq 'T' or $ref eq 'T' and $obs eq 'C') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
657 $countti++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
658 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
659 $counttv++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
660 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
661 $countsnp++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
662 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
663
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
664 print;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
665 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
666 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
667 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
668 print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
669 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
670
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
671 sub convertMAQSNP {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
672 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
673 my ($countline, $countvar, @other);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
674
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
675 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
676
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
677 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
678 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
679 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
680
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
681 my @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
682 if (@field == 12) { #SNPs
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
683 my ($chr, $pos, $wt, $call, @other) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
684 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
685
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
686 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
687
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
688 my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
689 my @obs = split (//, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
690 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
691 if ($obs[0] eq $wt and $obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
692 die "Error: reference alleles are identical to observed alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
693 } elsif ($obs[0] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
694 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
695 } elsif ($obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
696 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
697 } elsif ($obs[1] ne $obs[0]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
698 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
699 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
700 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
701 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
702 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
703 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
704 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
705 } elsif (@field == 13) { #indels; the deletion start site do not need changes; the duplication start site need additional investigation by ANNOVAR developers
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
706 my ($chr, $pos, $type, $numread, $call, @other) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
707 $chr =~ s/^chr//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
708
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
709 $includeinfo or @other = (); #unless -includeinfo is set, the other will not be printed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
710
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
711 my @obs = split (/:/, $call);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
712 @obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
713 if ($obs[0] =~ m/^\-(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
714 my $len = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
715 print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[1], "\t", '-', "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
716 } elsif ($obs[0] =~ m/^(\d+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
717 my $len = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
718 print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
719 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
720 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
721 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
722 die "Error: invalid record found in $variantfile (12 or 13 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
723 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
724 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
725 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
726 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
727
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
728 sub convertCASAVA {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
729 my ($variantfile, $chr) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
730 my ($countline, $countvar, @other);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
731
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
732 my ($intype);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
733 my ($pos_index, $call_index, $reference_index, $type_index, $score_index, $total_index, $used_index);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
734 my ($ref_indel_index, $quality_index, $maxgtype_index, $bp1_reads_index, $ref_reads_index, $indel_reads_index, $other_reads_index);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
735 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
736
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
737 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
738 s/[\r\n]+$//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
739 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
740 my @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
741
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
742 if (m/^#/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
743 s/^#//;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
744 if (s/^\$\sCOLUMNS\s//) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
745 @field = split (/\s+/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
746 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
747 @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
748 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
749 if (m/\bposition\b/ or m/\bpos\b/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
750 for my $i (0 .. @field-1) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
751 if ($field[$i] eq 'position' or $field[$i] eq 'pos') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
752 $pos_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
753 } elsif ($field[$i] eq 'modified_call') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
754 $intype = 'snp';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
755 print STDERR "NOTICE: Automatically detected input type as $intype\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
756 $call_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
757 } elsif ($field[$i] eq 'reference') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
758 $reference_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
759 } elsif ($field[$i] eq 'type') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
760 $type_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
761 } elsif ($field[$i] eq 'score') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
762 $score_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
763 } elsif ($field[$i] eq 'total') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
764 $total_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
765 } elsif ($field[$i] eq 'used') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
766 $used_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
767 } elsif ($field[$i] eq 'ref/indel') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
768 $intype = 'indel';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
769 print STDERR "NOTICE: Automatically detected input type as $intype\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
770 $ref_indel_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
771 } elsif ($field[$i] eq 'Q(indel)') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
772 $quality_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
773 } elsif ($field[$i] eq 'max_gtype') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
774 $maxgtype_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
775 } elsif ($field[$i] eq 'bp1_reads') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
776 $bp1_reads_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
777 } elsif ($field[$i] eq 'ref_reads') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
778 $ref_reads_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
779 } elsif ($field[$i] eq 'indel_reads') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
780 $indel_reads_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
781 } elsif ($field[$i] eq 'other_reads') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
782 $other_reads_index = $i;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
783 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
784 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
785 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
786 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
787 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
788
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
789 $intype or die "Error: unable to recognize the correct type of the input file (make sure that header line is present in $variantfile)\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
790 @field = split (/\t/, $_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
791
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
792 if ($intype eq 'snp') { #SNPs
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
793 defined $pos_index and defined $reference_index and defined $call_index or die "Error: unalbe to find the position, reference and modified_call column header in $variantfile\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
794 my ($pos, $wt, $obs) = @field[$pos_index, $reference_index, $call_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
795 my (@other);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
796 defined $pos and defined $wt and defined $obs or die;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
797 if ($includeinfo) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
798 defined $score_index and push @other, $field[$score_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
799 defined $total_index and push @other, $field[$total_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
800 defined $used_index and push @other, $field[$used_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
801 defined $type_index and push @other, $field[$type_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
802 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
803
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
804 length ($obs) == 1 and $obs .= $obs;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
805 my @obs = split (//, $obs);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
806 @obs == 2 or die "Error: observed allele $obs should correspond to two nucleotide alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
807 if ($obs[0] eq $wt and $obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
808 die "Error: reference alleles are identical to observed alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
809 } elsif ($obs[0] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
810 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
811 } elsif ($obs[1] eq $wt) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
812 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
813 } elsif ($obs[1] ne $obs[0]) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
814 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
815 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
816 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
817 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
818 print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
819 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
820 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
821 } elsif ($intype eq 'indel') { #indels
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
822 defined $pos_index and defined $ref_indel_index and defined $maxgtype_index or die "Error: unable to find the pos, ref_indel and max_gtype column header in $variantfile\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
823 my ($pos, $call, $hom, @other) = @field[$pos_index, $ref_indel_index, $maxgtype_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
824 if ($includeinfo) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
825 defined $quality_index and push @other, $field[$quality_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
826 defined $maxgtype_index and push @other, $field[$maxgtype_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
827 defined $bp1_reads_index and push @other, $field[$bp1_reads_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
828 defined $ref_reads_index and push @other, $field[$ref_reads_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
829 defined $indel_reads_index and push @other, $field[$indel_reads_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
830 defined $other_reads_index and push @other, $field[$other_reads_index];
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
831 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
832
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
833 #hg19 coordinate below; insertion needs position adjustment!!! deletion is fine
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
834 #948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
835 #978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
836 #1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
837 #1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
838 #185137455 11I10M2I TATGTGTCCT -----------TTTTTTATTT--/AAATGATAGACTTTTTTTTTTAA ATTTCAGAAA 1126 het 988 hom 45 20 24 7 N/A 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
839 #1276931 2D41M4I CACACACATG CACACACACGCACACACGTGCAATGTGAAAACACCTCATGCAG----/--CACACACGCACACACGTGCAATGTGAAAACACCTCATGCAGACAC ACACATGCAC 548 hom 16 het 8 0 11 11 N/A 0 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
840
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
841 my @obs = split (/\//, $call);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
842 @obs == 2 or die "Error: observed indel allele $call should correspond to two alleles: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
843 if ($obs[0] =~ m/^\-+$/) { #insertion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
844 my $len = length ($obs[0]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
845 print $chr, "\t", $pos-1, "\t", $pos-1, "\t", '-', "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
846 } elsif ($obs[1] =~ m/^\-+$/) { #deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
847 my $len = length ($obs[0]);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
848 print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[0], "\t", '-', "\t", $hom, "\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
849 } elsif (length ($obs[0]) eq length ($obs[1])) { #block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
850 $obs[0] =~ s/\-//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
851 $obs[1] =~ s/\-//g;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
852 print $chr, "\t", $pos, "\t", $pos+length($obs[0])-1, "\t", $obs[0], "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
853 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
854 die "Error: invalid record found in indel line: <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
855 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
856 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
857 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
858 die "Error: invalid record found in $variantfile (11 or 15 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
859 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
860 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
861 print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
862 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
863
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
864 sub convertVCF4 {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
865 my ($variantfile) = @_;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
866
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
867 my ($countline, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0 0/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
868
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
869 my ($source_program); #the program that generated the VCF4 file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
870 open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
871
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
872 while (<VAR>) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
873 $countline++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
874
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
875 if (m/^##fileformat=VCFv(\d+\.)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
876 $1<4 and print STDERR "ERROR: Input file is not in VCF version 4 format but is $_" and exit;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
877 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
878 if (m/^##UnifiedGenotyper/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
879 $source_program = 'gatksnp';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
880 print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK UnifiedGenotyper\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
881 print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
882 $fraction and print STDERR "WARNING: the --fraction argument will be ignored for GATK SNP calls!!!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
883 $confraction and print STDERR "WARNING: the --confraction argument will be ignored for GATK SNP calls!!!\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
884 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
885 if (m/^##IndelGenotyper/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
886 $source_program = 'gatkindel';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
887 print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK IndelGenotyper\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
888 print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
889 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
890
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
891 m/^#/ and next; #skip comment lines
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
892 s/[\r\n]+$//; #delete trailing new lines
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
893 my $otherinfo = $_; #this is the complete line (when -includeinfo is set, the entire line will be included in output file)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
894
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
895 #format description: http://www.1000genomes.org/wiki/Analysis/vcf4.0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
896 #standard VCF4 should have 8 columns, but some software may produce more columns (for example, for genotype calls). The first 8 columns should follow the specification
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
897
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
898 #example of VCF4 generated by GATK SNP caller
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
899 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
900 #1 55 . T G 34.82 . DP=2;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=14.16;MQ0=0;QD=17.41;SB=-10.00 GT:DP:GL:GQ 0/1:1:-6.66,-0.30,-0.00:1.76
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
901 #1 2646 . G A 40.91 . DP=4;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=7.50;MQ0=3;QD=10.23;SB=-10.00 GT:DP:GL:GQ 0/1:1:-7.27,-0.30,-0.00:1.76
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
902
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
903 #example of VCF4 generated by GATK indel caller
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
904 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
905 #1 2525324 . G GC . PASS AC=5,5;DP=12;MM=4.8,3.7142856;MQ=29.0,42.285713;NQSBQ=33.0,46.463768;NQSMM=0.24,0.20289855;SC=0,5,1,6 GT 0/1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
906 #1 3553372 . GC G . PASS AC=6,6;DP=6;MM=0.8333333,0.0;MQ=60.0,0.0;NQSBQ=63.533333,0.0;NQSMM=0.0,0.0;SC=0,6,0,0 GT 1/0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
907 #1 6093011 . CG C . PASS AC=31,31;DP=32;MM=0.7096774,2.0;MQ=59.64516,60.0;NQSBQ=64.192184,39.666668;NQSMM=0.0,0.11111111;SC=23,8,0,1 GT 1/0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
908
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
909 #example of VCF4 generated by 1000G
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
910 #CHROM POS ID REF ALT QUAL FILTER INFO
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
911 #1 533 . G C . PASS AA=.;AC=6;AN=120;DP=423
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
912 #1 41342 . T A . PASS AA=.;AC=29;AN=120;DP=188
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
913 #1 41791 . G A . PASS AA=.;AC=5;AN=120;DP=192
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
914 #1 44449 . T C . PASS AA=C;AC=2;AN=120;DP=166
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
915 #1 44539 rs2462492 C T . PASS AA=T;AC=2;AN=120;DP=131
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
916
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
917 #example of VCF4 generated by 1000G
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
918 #CHROM POS ID REF ALT QUAL FILTER INFO
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
919 #1 1000153 . TCACA T 100 PASS AF=0.115095;HP=1;NF=16;NR=13;NS=52;CA=0;DP=615
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
920 #1 1000906 . CA C 48 PASS AF=0.0772696;HP=1;NF=2;NR=9;NS=51;CA=0;DP=281
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
921 #1 1000950 rs60561655;-/G CG C 100 PASS AF=0.447771;HP=5;DB;NF=10;NR=20;NS=50;CA=M;DP=291
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
922 #1 1010786 rs36095298;-/G,mills,venter A AG 100 PASS AF=0.774334;HP=1;DB;NF=21;NR=27;NS=51;CA=0;DP=306
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
923 #1 1026158 . T TGGGGG 100 PASS AF=0.115637;HP=1;NF=5;NR=2;NS=52;CA=0;DP=591
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
924
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
925 #reserved VCF4 sub-fields in the INFO field
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
926 # * AA ancestral allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
927 # * AC allele count in genotypes, for each ALT allele, in the same order as listed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
928 # * AF allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
929 # * AN total number of alleles in called genotypes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
930 # * BQ RMS base quality at this position
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
931 # * CIGAR cigar string describing how to align an alternate allele to the reference allele
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
932 # * DB dbSNP membership
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
933 # * DP combined depth across samples, e.g. DP=154
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
934 # * END end position of the variant described in this record (esp. for CNVs)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
935 # * H2 membership in hapmap2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
936 # * MQ RMS mapping quality, e.g. MQ=52
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
937 # * MQ0 Number of MAPQ == 0 reads covering this record
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
938 # * NS Number of samples with data
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
939 # * SB strand bias at this position
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
940 # * SOMATIC indicates that the record is a somatic mutation, for cancer genomics
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
941 # * VALIDATED validated by follow-up experiment
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
942
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
943
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
944 #SAMtools/BCFtools specific information
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
945 #SAMtools/BCFtools may write the following tags in the INFO field in VCF/BCF.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
946 #Tag Description
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
947 #I16 16 integers:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
948 #1 #reference Q13 bases on the forward strand 2 #reference Q13 bases on the reverse strand
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
949 #3 #non-ref Q13 bases on the forward strand 4 #non-ref Q13 bases on the reverse strand
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
950 #5 sum of reference base qualities 6 sum of squares of reference base qualities
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
951 #7 sum of non-ref base qualities 8 sum of squares of non-ref base qualities
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
952 #9 sum of ref mapping qualities 10 sum of squares of ref mapping qualities
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
953 #11 sum of non-ref mapping qualities 12 sum of squares of non-ref mapping qualities
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
954 #13 sum of tail distance for ref bases 14 sum of squares of tail distance for ref bases
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
955 #15 sum of tail distance for non-ref bases 16 sum of squares of tail distance for non-ref
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
956 #INDEL Indicating the variant is an INDEL.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
957 #DP The number of reads covering or bridging POS.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
958 #DP4 Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
959 #PV4 P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
960 #FQ Consensus quality. If positive, FQ equals the phred-scaled probability of there being two or more different alleles. If negative, FQ equals the minus phred-scaled probability of all chromosomes being identical. Notably, given one sample, FQ is positive at hets and negative at homs.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
961 #AF1 EM estimate of the site allele frequency of the strongest non-reference allele.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
962 #CI95 Equal-tail (Bayesian) credible interval of the site allele frequency at the 95% level.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
963 #PC2 Phred-scaled probability of the alternate allele frequency of group1 samples being larger (,smaller) than of group2 samples.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
964 #PCHI2 Posterior weighted chi^2 P-value between group1 and group2 samples. This P-value is conservative.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
965 #QCHI2 Phred-scaled PCHI2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
966 #RP Number of permutations yeilding a smaller PCHI2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
967
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
968 #example of triallelic variants generated by mpileup/bcftools
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
969 #1 156706559 . A C,G 114 . DP=20;AF1=1;CI95=1,1;DP4=0,0,1,19;MQ=60;FQ=-63 GT:PL:GQ 1/2:237,126,90,162,0,138:99
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
970 #6 31129642 . A G,C 76 . DP=31;AF1=1;CI95=1,1;DP4=0,0,28,3;MQ=60;FQ=-75 GT:PL:GQ 1/2:255,194,146,164,0,119:99
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
971 #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
972
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
973 my @field=split(/\t/,$_);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
974 @field >=8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$_>\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
975 my ($chr, $start, $ID, $ref_allele, $mut_allele, $quality_score, $filter, $info, $format, $sample) = @field;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
976 my ($end);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
977 my ($mut_allele2, $zygosity);
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
978
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
979 if ($filterword) { #ensure that the filter field contains the filterword
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
980 $filter =~ m/\b$filterword\b/i or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
981 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
982
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
983
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
984 #sometimes the alleles are not in the same case
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
985 #chr1 1869771 1869774 actc aCTctc 43.5 13 INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
986 $ref_allele = uc $ref_allele;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
987 $mut_allele = uc $mut_allele;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
988
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
989 #if ($ID eq '.' || $ID =~ /^rs/) { #per MISHIMA, Hiroyuki suggestion (vcf4's third column (ID column) are not always ".")
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
990 # $end = $start; #this block is commented out on 2011feb19
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
991 #}
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
992
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
993 if ($mut_allele eq '.') { #no variant call was made at this position
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
994 next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
995 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
996
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
997 if ($mut_allele =~ m/([^,]+),([^,]+)/) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
998 $mut_allele = $1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
999 $mut_allele2 = $2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1000 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1001
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1002 if(length($ref_allele)==1 && length($mut_allele)==1) { ### output snv
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1003 my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1004 my ($MappingQuality) = $info =~ /MQ=([^;]+)/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1005 my ($QualityByDepth) = $info =~ /QD=([^;]+)/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1006
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1007
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1008
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1009 if ($coverage) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1010 defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1011 if ($maxcoverage) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1012 defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1013 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1014 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1015
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1016 if ($snpqual) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1017 defined $QualityByDepth and $QualityByDepth >= $snpqual || next; #the QD was used here as quality score
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1018 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1019
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1020 if (defined $sample) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1021 if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1022 $zygosity = 'het';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1023 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1024 } elsif ($sample =~ m#^1/1#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1025 $zygosity = 'hom';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1026 $counthom++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1027 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1028 $zygosity = 'unknown';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1029 $countunknown++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1030 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1031 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1032 $zygosity = 'unknown';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1033 $countunknown++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1034 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1035
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1036 #the subject is called as homozygous for the first alternative allele (genotype 1/1. i.e. C/C), but since there was one read containing A, samtools still keep both alleles in the VCF file (but gives a very low probabilities for it).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1037 #1 11297762 . T C,A 98 . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78 GT:PL:GQ 1/1:131,51,0,120,28,117:99
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1038 if ($mut_allele2 and $zygosity eq 'hom') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1039 $mut_allele2 = '';
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1040 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1041
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1042 if (not $mut_allele2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1043 if ($ref_allele eq 'A' and $mut_allele eq 'G' or $ref_allele eq 'G' and $mut_allele eq 'A' or $ref_allele eq 'C' and $mut_allele eq 'T' or $ref_allele eq 'T' and $mut_allele eq 'C') {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1044 $countti++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1045
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1046 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1047 $counttv++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1048 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1049 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1050
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1051 print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, "\t$zygosity", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1052
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1053 if ($allallele) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1054 if ($mut_allele2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1055 print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele2, "\t$zygosity", "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1056 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1057 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1058
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1059 $countsnp++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1060 } elsif (length($ref_allele) > 1 || length($mut_allele) > 1) { ### output indel
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1061 my ($indel_read_depth1, $indel_read_depth2) = $info =~ /AC=([^,;]+),([^,;]+)/; #number of reads supporting consensus indel, any indel
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1062 my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1063
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1064 if ($coverage) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1065 defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1066 if ($maxcoverage) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1067 defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1068 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1069 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1070
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1071 if (defined $indel_read_depth1 and defined $unfiltered_read_depth) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1072 $indel_read_depth1/$unfiltered_read_depth >= $fraction or next; #do not meet minimum alternative allele fraction threshold
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1073 $indel_read_depth1/$indel_read_depth2 >= $confraction or next;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1074 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1075
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1076 my ($MappingQuality) = $info =~ /MQ=([^;]+),/;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1077
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1078 #example VCF4 records below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1079 #20 2 . TCG T . PASS DP=100
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1080 #Chr1 5473 . AT ATT 23.5 . INDEL;DP=16;AF1=0.5;CI95=0.5,0.5;DP4=4,2,3,1;MQ=42;PV4=1,0.41,0.042,0.24
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1081 #Chr1 6498 . ATTTT ATTTTT 53.5 . INDEL;DP=9;AF1=1;CI95=1,1;DP4=0,0,5,3;MQ=28
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1082
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1083 if(length($ref_allele) > length ($mut_allele)) { # deletion or block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1084 my $head = substr($ref_allele, 0, length ($mut_allele));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1085 if ($head eq $mut_allele) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1086 print $chr,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1087 print $start+length($head),"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1088 print $start+length($ref_allele)-1,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1089
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1090 $ref_allele = substr ($ref_allele, length ($mut_allele));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1091 print $ref_allele,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1092 print "-";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1093 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1094 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1095 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1096 } elsif(length($mut_allele) >= length ($ref_allele)) { # insertion or block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1097 my $head = substr ($mut_allele, 0, length ($ref_allele));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1098 if ($head eq $ref_allele) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1099 print $chr,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1100 print $start+length($ref_allele)-1,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1101 print $start+length($ref_allele)-1,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1102
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1103 $mut_allele = substr ($mut_allele, length ($ref_allele));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1104 print "-\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1105 print $mut_allele;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1106 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1107 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1108 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1109 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1110
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1111 if (defined $sample) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1112 if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1113 print "\thet";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1114 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1115 } elsif ($sample =~ m#^1/1#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1116 print "\thom";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1117 $counthom++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1118 } # BEGIN ARQ
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1119 elsif ($sample =~ m#^./.#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1120 print "\tunknown";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1121 $countunknown++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1122 } # END ARQ
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1123 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1124
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1125 print "\t", $quality_score;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1126 defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1127
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1128 defined $indel_read_depth1 and print "\t", $indel_read_depth1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1129 defined $MappingQuality and print "\t", $MappingQuality;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1130 $includeinfo and print "\t", $otherinfo;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1131 print "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1132 $countindel++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1133
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1134
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1135 #do the same thing again, exactly like above, except that we work on second mutation;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1136 #in the future, consider rewrite this paragraph to make the code more elegant
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1137 if ($allallele) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1138 if ($mut_allele2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1139 if(length($ref_allele) > length ($mut_allele2)) { # deletion or block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1140 my $head = substr($ref_allele, 0, length ($mut_allele2));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1141 if ($head eq $mut_allele2) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1142 print $chr,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1143 print $start+length($head),"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1144 print $start+length($ref_allele)-1,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1145
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1146 $ref_allele = substr ($ref_allele, length ($mut_allele2));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1147 print $ref_allele,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1148 print "-";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1149 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1150 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1151 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1152 } elsif(length($mut_allele2) > length ($ref_allele)) { # insertion or block substitution
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1153 my $head = substr ($mut_allele2, 0, length ($ref_allele));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1154 if ($head eq $ref_allele) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1155 print $chr,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1156 print $start+length($ref_allele)-1,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1157 print $start+length($ref_allele)-1,"\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1158
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1159 $mut_allele = substr ($mut_allele2, length ($ref_allele));
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1160 print "-\t";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1161 print $mut_allele2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1162 } else {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1163 print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1164 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1165 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1166
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1167 if (defined $sample) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1168 if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1169 print "\thet";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1170 $counthet++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1171 } elsif ($sample =~ m#^1/1#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1172 print "\thom";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1173 $counthom++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1174 } # BEGIN ARQ
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1175 elsif ($sample =~ m#^./.#) {
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1176 print "\tunknown";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1177 $countunknown++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1178 } # END ARQ
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1179 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1180
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1181 print "\t", $quality_score;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1182 defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1183
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1184 defined $indel_read_depth1 and print "\t", $indel_read_depth1;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1185 defined $MappingQuality and print "\t", $MappingQuality;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1186 $includeinfo and print "\t", $otherinfo;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1187 print "\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1188
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1189 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1190 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1191 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1192 $countvar++;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1193 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1194 my $triallelic = $countsnp-$countti-$counttv;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1195 print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1196 print STDERR "NOTICE: Among ${\($counthet+$counthom+$countunknown)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1197 print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1198 }
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1199
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1200
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1201 =head1 SYNOPSIS
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1202
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1203 convert2annovar.pl [arguments] <variantfile>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1204
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1205 Optional arguments:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1206 -h, --help print help message
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1207 -m, --man print complete documentation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1208 -v, --verbose use verbose output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1209 --format <string> input format (default: pileup)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1210 --outfile <file> output file name (default: STDOUT)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1211 --snpqual <float> quality score threshold in pileup file (default: 20)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1212 --snppvalue <float> SNP P-value threshold in GFF3-SOLiD file (default: 1)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1213 --coverage <int> read coverage threshold in pileup file (default: 0)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1214 --maxcoverage <int> maximum coverage threshold (default: none)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1215 --includeinfo include supporting information in output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1216 --chr <string> specify the chromosome (for CASAVA format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1217 --chrmt <string> chr identifier for mitochondria (default: M)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1218 --altcov <int> alternative allele coverage threshold (for pileup format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1219 --fraction <float> minimum allelic fraction to claim a mutation (for pileup/vcf4_indel format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1220 --species <string> if human, convert chr23/24/25 to X/Y/M (for gff3-solid format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1221 --filter <string> output variants with this filter (case insensitive, for vcf4 format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1222 --confraction <float> minimum consensus indel / all indel fraction (for vcf4_indel format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1223 --allallele print all alleles when multiple calls are present (for vcf4 format)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1224
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1225 Function: convert variant call file generated from various software programs
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1226 into ANNOVAR input format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1227
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1228 Example: convert2annovar.pl -format pileup -outfile variant.query variant.pileup
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1229 convert2annovar.pl -format cg -outfile variant.query variant.cg
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1230 convert2annovar.pl -format gff3-solid -outfile variant.query variant.snp.gff
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1231 convert2annovar.pl -format soap variant.snp > variant.avinput
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1232 convert2annovar.pl -format maq variant.snp > variant.avinput
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1233 convert2annovar.pl -format casava -chr 1 variant.snp > variant.avinput
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1234 convert2annovar.pl -format vcf4 variantfile > variant.avinput
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1235 convert2annovar.pl -format vcf4 -filter pass variantfile > variant.avinput
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1236
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1237 Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1238
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1239 =head1 OPTIONS
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1240
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1241 =over 8
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1242
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1243 =item B<--help>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1244
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1245 print a brief usage message and detailed explanation of options.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1246
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1247 =item B<--man>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1248
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1249 print the complete manual of the program.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1250
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1251 =item B<--verbose>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1252
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1253 use verbose output.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1254
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1255 =item B<--format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1256
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1257 the format of the input files.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1258
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1259 =item B<--outfile>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1260
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1261 specify the output file name. By default, output is written to STDOUT.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1262
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1263 =item B<--snpqual>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1264
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1265 quality score threshold in the pileup file, such that variant calls with lower
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1266 quality scores will not be printed out in the output file. When VCF4 file is
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1267 used, this argument works on the Quality-by-Depth measure, rather than the raw
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1268 quality measure.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1269
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1270 =item B<--coverage>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1271
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1272 read coverage threshold in the pileup file, such that variants calls generated
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1273 with lower coverage will not be printed in the output file.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1274
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1275 =item B<--includeinfo>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1276
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1277 specify that the output should contain additional information in the input line.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1278 By default, only the chr, start, end, reference allele, observed allele and
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1279 homozygosity status are included in output files.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1280
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1281 =item B<--chr>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1282
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1283 specify the chromosome for CASAVA format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1284
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1285 =item B<--chrmt>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1286
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1287 specify the name of mitochondria chromosome (default is MT)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1288
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1289 =item B<--altcov>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1290
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1291 the minimum coverage of the alternative (mutated) allele to be printed out in
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1292 output
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1293
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1294 =item B<--fraction>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1295
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1296 specify the minimum fraction of alternative allele, to print out the mutation.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1297 For example, a site has 10 reads, 3 supports alternative allele. A -fraction of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1298 0.4 will not allow the mutation to be printed out.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1299
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1300 =item B<--species>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1301
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1302 specify the species from which the sequencing data is obtained. For the GFF3-
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1303 SOLiD format, when species is human, the chromosome 23, 24 and 25 will be
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1304 converted to X, Y and M, respectively.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1305
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1306 =item B<--filter>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1307
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1308 for VCF4 file, only print out variant calls with this filter annotated. For
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1309 example, if using GATK VariantFiltration walker, you will see PASS,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1310 GATKStandard, HARD_TO_VALIDATE, etc in the filter field. Using 'pass' as a
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1311 filter is recommended in this case.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1312
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1313 =item B<--confraction>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1314
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1315 consesus indel fraction, calculated as reads supporting consensus indels divided
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1316 by reads supporting any indels
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1317
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1318 =item B<--allallele>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1319
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1320 print all alleles for mutations at a locus, rather than the first allele, if the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1321 input VCF4 file contains multiple alternative alleles for a mutation. By
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1322 default, this option is off. When it is on, two lines will be printed out in the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1323 output, and both will have the same quality scores as VCF4 does not provide
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1324 separate quality scores for individual alleles.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1325
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1326 =back
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1327
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1328 =head1 DESCRIPTION
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1329
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1330 This program is used to convert variant call file generated from various
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1331 software programs into ANNOVAR input format. Currently, the program can handle
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1332 Samtools genotype-calling pileup format, Solid GFF format, Complete Genomics
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1333 variant format, SOAP format. These formats are described below.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1334
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1335 =over 8
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1336
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1337 =item * B<pileup format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1338
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1339 The pileup format can be produced by the Samtools genotyping calling subroutine.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1340 Note that the phrase pileup format can be used in several instances, and here I
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1341 am only referring to the pileup files that contains the actual genotype calls.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1342
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1343 Using SamTools, given an alignment file in BAM format, a pileup file with
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1344 genotype calls can be produced by the command below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1345
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1346 samtools pileup -vcf ref.fa aln.bam> raw.pileup
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1347 samtools.pl varFilter raw.pileup > final.pileup
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1348
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1349 ANNOVAR will automatically filter the pileup file so that only SNPs reaching a
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1350 quality threshold are printed out (default is 20, use --snpqual argument to
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1351 change this). Most likely, users may want to also apply a coverage threshold,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1352 such that SNPs calls from only a few reads are not considered. This can be
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1353 achieved using the -coverage argument (default value is 0).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1354
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1355 An example of pileup files for SNPs is shown below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1356
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1357 chr1 556674 G G 54 0 60 16 a,.....,...,.... (B%A+%7B;0;%=B<:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1358 chr1 556675 C C 55 0 60 16 ,,..A..,...,.... CB%%5%,A/+,%....
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1359 chr1 556676 C C 59 0 60 16 g,.....,...,.... .B%%.%.?.=/%...1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1360 chr1 556677 G G 75 0 60 16 ,$,.....,...,.... .B%%9%5A6?)%;?:<
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1361 chr1 556678 G K 60 60 60 24 ,$.....,...,....^~t^~t^~t^~t^~t^~t^~t^~t^~t B%%B%<A;AA%??<=??;BA%B89
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1362 chr1 556679 C C 61 0 60 23 .....a...a....,,,,,,,,, %%1%&?*:2%*&)(89/1A@B@@
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1363 chr1 556680 G K 88 93 60 23 ..A..,..A,....ttttttttt %%)%7B:B0%55:7=>>A@B?B;
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1364 chr1 556681 C C 102 0 60 25 .$....,...,....,,,,,,,,,^~,^~. %%3%.B*4.%.34.6./B=?@@>5.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1365 chr1 556682 A A 70 0 60 24 ...C,...,....,,,,,,,,,,. %:%(B:A4%7A?;A><<999=<<
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1366 chr1 556683 G G 99 0 60 24 ....,...,....,,,,,,,,,,. %A%3B@%?%C?AB@BB/./-1A7?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1367
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1368 The columns are chromosome, 1-based coordinate, reference base, consensus base,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1369 consensus quality, SNP quality, maximum mapping quality of the reads covering
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1370 the sites, the number of reads covering the site, read bases and base qualities.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1371
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1372 An example of pileup files for indels is shown below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1373
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1374 seq2 156 * +AG/+AG 71 252 99 11 +AG * 3 8 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1375
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1376 ANNOVAR automatically recognizes both SNPs and indels in pileup file, and process them correctly.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1377
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1378 =item * B<GFF3-SOLiD format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1379
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1380 The SOLiD provides a GFF3-compatible format for SNPs, indels and structural
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1381 variants. A typical example file is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1382
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1383 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1384 ##solid-gff-version 0.3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1385 ##source-version 2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1386 ##type DNA
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1387 ##date 2009-03-13
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1388 ##time 0:0:0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1389 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1390 ##reference-file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1391 ##input-files Yoruban_snp_10x.txt
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1392 ##run-path
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1393 chr_name AB_SOLiD SNP caller SNP coord coord 1 . . coverage=# cov;ref_base=ref;ref_score=score;ref_confi=confi;ref_single=Single;ref_paired=Paired;consen_base=consen;consen_score=score;consen_confi=conf;consen_single=Single;consen_paired=Paired;rs_id=rs_id,dbSNP129
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1394 1 AB_SOLiD SNP caller SNP 997 997 1 . . coverage=3;ref_base=A;ref_score=0.3284;ref_confi=0.9142;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6716;consen_confi=0.9349;consen_single=0/0;consen_paired=2/2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1395 1 AB_SOLiD SNP caller SNP 2061 2061 1 . . coverage=2;ref_base=G;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.8985;consen_single=0/0;consen_paired=2/2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1396 1 AB_SOLiD SNP caller SNP 4770 4770 1 . . coverage=2;ref_base=A;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=G;consen_score=1.0000;consen_confi=0.8854;consen_single=0/0;consen_paired=2/2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1397 1 AB_SOLiD SNP caller SNP 4793 4793 1 . . coverage=14;ref_base=A;ref_score=0.0723;ref_confi=0.8746;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6549;consen_confi=0.8798;consen_single=0/0;consen_paired=9/9
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1398 1 AB_SOLiD SNP caller SNP 6241 6241 1 . . coverage=2;ref_base=T;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.7839;consen_single=0/0;consen_paired=2/2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1399
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1400 Newer version of ABI BioScope now use diBayes caller, and the output file is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1401
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1402 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1403 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1404 ##List of SNPs. Date Sat Dec 18 10:30:45 2010 Stringency: medium Mate Pair: 1 Read Length: 50 Polymorphism Rate: 0.003000 Bayes Coverage: 60 Bayes_Single_SNP: 1 Filter_Single_SNP: 1 Quick_P_Threshold: 0.997000 Bayes_P_Threshold: 0.040000 Minimum_Allele_Ratio: 0.150000 Minimum_Allele_Ratio_Multiple_of_Dicolor_Error: 100
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1405 ##1 chr1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1406 ##2 chr2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1407 ##3 chr3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1408 ##4 chr4
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1409 ##5 chr5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1410 ##6 chr6
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1411 ##7 chr7
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1412 ##8 chr8
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1413 ##9 chr9
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1414 ##10 chr10
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1415 ##11 chr11
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1416 ##12 chr12
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1417 ##13 chr13
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1418 ##14 chr14
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1419 ##15 chr15
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1420 ##16 chr16
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1421 ##17 chr17
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1422 ##18 chr18
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1423 ##19 chr19
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1424 ##20 chr20
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1425 ##21 chr21
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1426 ##22 chr22
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1427 ##23 chrX
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1428 ##24 chrY
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1429 ##25 chrM
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1430 # source-version SOLiD BioScope diBayes(SNP caller)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1431 #Chr Source Type Pos_Start Pos_End Score Strand Phase Attributes
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1432 chr1 SOLiD_diBayes SNP 221367 221367 0.091151 . . genotype=R;reference=G;coverage=3;refAlleleCounts=1;refAlleleStarts=1;refAlleleMeanQV=29;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=27;diColor1=11;diColor2=33;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1433 chr1 SOLiD_diBayes SNP 555317 555317 0.095188 . . genotype=Y;reference=T;coverage=13;refAlleleCounts=11;refAlleleStarts=10;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=00;diColor2=22;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1434 chr1 SOLiD_diBayes SNP 555327 555327 0.037582 . . genotype=Y;reference=T;coverage=12;refAlleleCounts=6;refAlleleStarts=6;refAlleleMeanQV=19;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=12;diColor2=30;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1435 chr1 SOLiD_diBayes SNP 559817 559817 0.094413 . . genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1436 chr1 SOLiD_diBayes SNP 714068 714068 0.000000 . . genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag=
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1437 The file conforms to standard GFF3 specifications, but the last column is solid-
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1438 specific and it gives certain parameters for the SNP calls.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1439
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1440 An example of the short indel format by GFF3-SOLiD is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1441
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1442 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1443 ##solid-gff-version 0.3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1444 ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1445 ##type DNA
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1446 ##date 2009-01-26
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1447 ##time 18:33:20
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1448 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1449 ##reference-file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1450 ##input-files ../../mp-results/JOAN_20080104_1.pas,../../mp-results/BARB_20071114_1.pas,../../mp-results/BARB_20080227_2.pas
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1451 ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-w2x25-2x-4x-8x-10x/2x
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1452 ##Filter-settings: max-ave-read-pos=none,min-ave-from-end-pos=9.1,max-nonreds-4filt=2,min-insertion-size=none,min-deletion-size=none,max-insertion-size=none,max-deletion-size=none,require-called-indel-size?=T
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1453 chr1 AB_SOLiD Small Indel Tool deletion 824501 824501 1 . . del_len=1;tight_chrom_pos=824501-824502;loose_chrom_pos=824501-824502;no_nonred_reads=2;no_mismatches=1,0;read_pos=4,6;from_end_pos=21,19;strands=+,-;tags=R3,F3;indel_sizes=-1,-1;read_seqs=G3021212231123203300032223,T3321132212120222323222101;dbSNP=rs34941678,chr1:824502-824502(-),EXACT,1,/GG
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1454 chr1 AB_SOLiD Small Indel Tool insertion_site 1118641 1118641 1 . . ins_len=3;tight_chrom_pos=1118641-1118642;loose_chrom_pos=1118641-1118642;no_nonred_reads=2;no_mismatches=0,1;read_pos=17,6;from_end_pos=8,19;strands=+,+;tags=F3,R3;indel_sizes=3,3;read_seqs=T0033001100022331122033112,G3233112203311220000001002
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1455
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1456 The keyword deletion or insertion_site is used in the fourth column to indicate
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1457 that file format.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1458
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1459 An example of the medium CNV format by GFF3-SOLiD is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1460
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1461 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1462 ##solid-gff-version 0.3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1463 ##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1464 ##type DNA
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1465 ##date 2009-01-27
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1466 ##time 15:54:36
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1467 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1468 ##reference-file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1469 ##input-files big_d20e5-del12n_up-ConsGrp-2nonred.pas.sum
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1470 ##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-results-lmp-e5/big_d20e5-indel_950_2050
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1471 chr1 AB_SOLiD Small Indel Tool deletion 3087770 3087831 1 . . del_len=62;tight_chrom_pos=none;loose_chrom_pos=3087768-3087773;no_nonred_reads=2;no_mismatches=2,2;read_pos=27,24;from_end_pos=23,26;strands=-,+;tags=F3,F3;indel_sizes=-62,-62;read_seqs=T11113022103331111130221213201111302212132011113022,T02203111102312122031111023121220311111333012203111
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1472 chr1 AB_SOLiD Small Indel Tool deletion 4104535 4104584 1 . . del_len=50;tight_chrom_pos=4104534-4104537;loose_chrom_pos=4104528-4104545;no_nonred_reads=3;no_mismatches=0,4,4;read_pos=19,19,27;from_end_pos=31,31,23;strands=+,+,-;tags=F3,R3,R3;indel_sizes=-50,-50,-50;read_seqs=T31011011013211110130332130332132110110132020312332,G21031011013211112130332130332132110132132020312332,G20321302023001101123123303103303101113231011011011
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1473 chr1 AB_SOLiD Small Indel Tool insertion_site 2044888 2044888 1 . . ins_len=18;tight_chrom_pos=2044887-2044888;loose_chrom_pos=2044887-2044889;no_nonred_reads=2;bead_ids=1217_1811_209,1316_908_1346;no_mismatches=0,2;read_pos=13,15;from_end_pos=37,35;strands=-,-;tags=F3,F3;indel_sizes=18,18;read_seqs=T31002301231011013121000101233323031121002301231011,T11121002301231011013121000101233323031121000101231;non_indel_no_mismatches=3,1;non_indel_seqs=NIL,NIL
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1474 chr1 AB_SOLiD Small Indel Tool insertion_site 74832565 74832565 1 . . ins_len=16;tight_chrom_pos=74832545-74832565;loose_chrom_pos=74832545-74832565;no_nonred_reads=2;bead_ids=1795_181_514,1651_740_519;no_mismatches=0,2;read_pos=13,13;from_end_pos=37,37;strands=-,-;tags=F3,R3;indel_sizes=16,16;read_seqs=T33311111111111111111111111111111111111111111111111,G23311111111111111111111111111111111111111311011111;non_indel_no_mismatches=1,0;non_indel_seqs=NIL,NIL
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1475
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1476 An example of the large indel format by GFF3-SOLiD is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1477
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1478 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1479 ##solid-gff-version 0.3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1480 ##source-version ???
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1481 ##type DNA
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1482 ##date 2009-03-13
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1483 ##time 0:0:0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1484 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1485 ##reference-file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1486 ##input-files /data/results5/yoruban_strikes_back_large_indels/LMP/five_mm_unique_hits_no_rescue/5_point_6x_del_lib_1/results/NA18507_inter_read_indels_5_point_6x.dat
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1487 ##run-path
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1488 chr1 AB_SOLiD Large Indel Tool insertion_site 1307279 1307791 1 . . deviation=-742;stddev=7.18;ref_clones=-;dev_clones=4
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1489 chr1 AB_SOLiD Large Indel Tool insertion_site 2042742 2042861 1 . . deviation=-933;stddev=8.14;ref_clones=-;dev_clones=3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1490 chr1 AB_SOLiD Large Indel Tool insertion_site 2443482 2444342 1 . . deviation=-547;stddev=11.36;ref_clones=-;dev_clones=17
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1491 chr1 AB_SOLiD Large Indel Tool insertion_site 2932046 2932984 1 . . deviation=-329;stddev=6.07;ref_clones=-;dev_clones=14
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1492 chr1 AB_SOLiD Large Indel Tool insertion_site 3166925 3167584 1 . . deviation=-752;stddev=13.81;ref_clones=-;dev_clones=14
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1493
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1494 An example of the CNV format by GFF3-SOLiD if given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1495
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1496 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1497 ##solid-gff-version 0.3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1498 ##source-version ???
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1499 ##type DNA
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1500 ##date 2009-03-13
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1501 ##time 0:0:0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1502 ##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1503 ##reference-file
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1504 ##input-files Yoruban_cnv.coords
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1505 ##run-path
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1506 chr1 AB_CNV_PIPELINE repeat_region 1062939 1066829 . . . fraction_mappable=51.400002;logratio=-1.039300;copynum=1;numwindows=1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1507 chr1 AB_CNV_PIPELINE repeat_region 1073630 1078667 . . . fraction_mappable=81.000000;logratio=-1.409500;copynum=1;numwindows=2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1508 chr1 AB_CNV_PIPELINE repeat_region 2148325 2150352 . . . fraction_mappable=98.699997;logratio=-1.055000;copynum=1;numwindows=1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1509 chr1 AB_CNV_PIPELINE repeat_region 2245558 2248109 . . . fraction_mappable=78.400002;logratio=-1.042900;copynum=1;numwindows=1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1510 chr1 AB_CNV_PIPELINE repeat_region 3489252 3492632 . . . fraction_mappable=59.200001;logratio=-1.119900;copynum=1;numwindows=1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1511 chr1 AB_CNV_PIPELINE repeat_region 5654415 5657276 . . . fraction_mappable=69.900002;logratio=1.114500;copynum=4;numwindows=1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1512 chr1 AB_CNV_PIPELINE repeat_region 9516165 9522726 . . . fraction_mappable=65.850006;logratio=-1.316700;numwindows=2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1513 chr1 AB_CNV_PIPELINE repeat_region 16795117 16841025 . . . fraction_mappable=44.600002;logratio=1.880778;copynum=7;numwindows=9
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1514
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1515 The keyword repeat_region is used here, although it actually refers to CNVs.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1516
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1517 An example of the inversion format by GFF3-SOLiD is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1518
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1519 ##gff-version 3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1520 ##solid-gff-version 0.2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1521 ##generated by SOLiD inversion tool
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1522 chr10 AB_SOLiD inversion 46443107 46479585 268.9 . . left=chr10:46443107-46443146;right=chr10:46479583-46479585;leftscore=295.0;rightscore=247.0;count_AAA_further_left=117;count_AAA_left=3;count_AAA_right=3;count_AAA_further_right=97;left_min_count_AAA=chr10:46443107-46443112;count_AAA_min_left=0;count_AAA_max_left=3;right_min_count_AAA=chr10:46479585-46479585;count_AAA_min_right=1;count_AAA_max_right=3;homozygous=UNKNOWN
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1523 chr4 AB_SOLiD inversion 190822813 190850112 214.7 . . left=chr4:190822813-190822922;right=chr4:190850110-190850112;leftscore=140.0;rightscore=460.0;count_AAA_further_left=110;count_AAA_left=78;count_AAA_right=74;count_AAA_further_right=77;left_min_count_AAA=chr4:190822813-190822814;count_AAA_min_left=69;count_AAA_max_left=77;right_min_count_AAA=chr4:190850110-190850112;count_AAA_min_right=74;count_AAA_max_right=74;homozygous=NO
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1524 chr6 AB_SOLiD inversion 168834969 168837154 175.3 . . left=chr6:168834969-168835496;right=chr6:168836643-168837154;leftscore=185.4;rightscore=166.2;count_AAA_further_left=67;count_AAA_left=43;count_AAA_right=40;count_AAA_further_right=59;left_min_count_AAA=chr6:168835058-168835124,chr6:168835143-168835161,chr6:168835176-168835181,chr6:168835231-168835262;count_AAA_min_left=23;count_AAA_max_left=29;right_min_count_AAA=chr6:168836643-168836652;count_AAA_min_right=23;count_AAA_max_right=31;homozygous=NO
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1525
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1526 The program should be able to recognize all the above GFF3-SOLiD format
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1527 automatically, and handle them accordingly.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1528
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1529 =item * B<Complete Genomics format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1530
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1531 This format is provided by the Complete Genomics company to their customers. The
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1532 file var-[ASM-ID].tsv.bz2 includes a description of all loci where the assembled
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1533 genome differs from the reference genome.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1534
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1535 An example of the Complete Genomics format is shown below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1536
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1537 #BUILD 1.5.0.5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1538 #GENERATED_AT 2009-Nov-03 19:52:21.722927
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1539 #GENERATED_BY dbsnptool
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1540 #TYPE VAR-ANNOTATION
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1541 #VAR_ANN_SET /Proj/Pipeline/Production_Data/REF/HUMAN-F_06-REF/dbSNP.csv
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1542 #VAR_ANN_TYPE dbSNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1543 #VERSION 0.3
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1544
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1545 >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1546 1 2 all chr1 0 959 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1547 2 2 all chr1 959 972 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1548 3 2 all chr1 972 1001 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1549 4 2 all chr1 1001 1008 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1550 5 2 all chr1 1008 1114 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1551 6 2 all chr1 1114 1125 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1552 7 2 all chr1 1125 1191 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1553 8 2 all chr1 1191 1225 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1554 9 2 all chr1 1225 1258 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1555 10 2 all chr1 1258 1267 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1556 12 2 all chr1 1267 1275 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1557 13 2 all chr1 1275 1316 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1558 14 2 all chr1 1316 1346 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1559 15 2 all chr1 1346 1367 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1560 16 2 all chr1 1367 1374 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1561 17 2 all chr1 1374 1388 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1562 18 2 all chr1 1388 1431 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1563 19 2 all chr1 1431 1447 = = =
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1564 20 2 all chr1 1447 1454 no-call = ?
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1565
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1566 The following information is provided in documentation from Complete Genomics, that describes the var-ASM format.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1567
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1568 1. locus. Identifier of a particular genomic locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1569 2. ploidy. The ploidy of the reference genome at the locus (= 2 for autosomes, 2 for pseudoautosomal regions on the sex chromosomes, 1 for males on the non-pseudoautosomal parts of the sex chromosomes, 1 for mitochondrion, '?' if varType is 'no-ref' or 'PAR-called-in-X'). The reported ploidy is fully determined by gender, chromosome and location, and is not inferred from the sequence data.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1570 3. haplotype. Identifier for each haplotype at the variation locus. For diploid genomes, 1 or 2. Shorthand of 'all' is allowed where the varType field is one of 'ref', 'no-call', 'no-ref', or 'PAR-called-in-X'. Haplotype numbering does not imply phasing; haplotype 1 in locus 1 is not necessarily in phase with haplotype 1 in locus 2. See hapLink, below, for phasing information.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1571 4. chromosome. Chromosome name in text: 'chr1','chr2', ... ,'chr22','chrX','chrY'. The mitochondrion is represented as 'chrM'. The pseudoautosomal regions within the sex chromosomes X and Y are reported at their coordinates on chromosome X.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1572 5. begin. Reference coordinate specifying the start of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1573 6. end. Reference coordinate specifying the end of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1574 7. varType. Type of variation, currently one of:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1575 snp: single-nucleotide polymorphism
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1576 ins: insertion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1577 del: deletion
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1578 sub: Substitution of one or more reference bases with the bases in the allele column
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1579 'ref' : no variation; the sequence is identical to the reference sequence on the indicated haplotype
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1580 no-call-rc: 'no-call reference consistent 'one or more bases are ambiguous, but the allele is potentially consistent with the reference
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1581 no-call-ri: 'no-call reference inconsistent' one or more bases are ambiguous, but the allele is definitely inconsistent with the reference
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1582 no-call: an allele is completely indeterminate in length and composition, i.e. alleleSeq = '?'
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1583 no-ref: the reference sequence is unspecified at this locus.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1584 PAR-called-in-X: this locus overlaps one of the pseudoautosomal regions on the sex chromosomes. The called sequence is reported as diploid sequence on Chromosome X; on chromosome Y the sequence is reported as varType = 'PAR-called-in-X'.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1585 8. reference. The reference sequence for the locus of variation. Empty when varType is ins. A value of '=' indicates that the user must consult the reference for the sequence; this shorthand is only used in regions where no haplotype deviates from the reference sequence.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1586 9. alleleSeq. The observed sequence at the locus of variation. Empty when varType is del. '?' isused to indicate 0 or more unknown bases within the sequence; 'N' is used to indicate exactly one unknown base within the sequence.'=' is used as shorthand to indicate identity to the reference sequence for non-variant sequence, i.e. when varType is 'ref'.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1587 10. totalScore. A score corresponding to a single variation and haplotype, representing the confidence in the call.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1588 11. hapLink. Identifier that links a haplotype at one locus to haplotypes at other loci. Currently only populated for very proximate variations that were assembled together. Two calls that share a hapLink identifier are expected to be on the same haplotype,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1589 12. xRef. Field containing external variation identifiers, currently only populated for variations corroborated directly by dbSNP. Format: dbsnp:[rsID], with multiple entries separated by the semicolon (;).
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1590
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1591 In older versions of the format specification, the sub keyword used to be insdel
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1592 keyword. ANNOVAR takes care of this.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1593
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1594 =item * B<SOAPsnp format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1595
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1596 An example of the SOAP SNP caller format is shown below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1597
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1598 chr8 35782 A R 1 A 27 1 2 G 26 1 2 5 0.500000 2.00000 1 5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1599 chr8 35787 G R 0 G 25 4 6 A 17 2 4 10 0.266667 1.60000 0 5
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1600
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1601 The following information is provided in documentation from BGI who developed
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1602 SOAP suite. It differs slightly from the description at the SOAPsnp website, and
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1603 presumably the website is outdated.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1604
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1605 Format description:(left to right)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1606 1. Chromosome name
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1607 2. Position of locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1608 3. Nucleotide at corresponding locus of reference sequence
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1609 4. Genotype of sequencing sample
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1610 5. Quality value
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1611 6. nucleotide with the highest probability(first nucleotide)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1612 7. Quality value of the nucleotide with the highest probability
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1613 8. Number of supported reads that can only be aligned to this locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1614 9. Number of all supported reads that can be aligned to this locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1615 10. Nucleotide with higher probability
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1616 11. Quality value of nucleotide with higher probability
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1617 12. Number of supported reads that can only be aligned to this locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1618 13. Number of all supported reads that can be aligned to this locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1619 14. Total number of reads that can be aligned to this locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1620 15. Order and quality value
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1621 16. Estimated copy number for this locus
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1622 17. Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1623 18. The distance between this locus and another closest SNP
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1624
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1625 =item * B<SOAPindel format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1626
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1627 The current version of ANNOVAR handles SoapSNP and SoapIndel automatically via a
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1628 single argument '--format soap'. An example of SOAP indel caller format is shown
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1629 below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1630
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1631 chr11 44061282 - +2 CT Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1632 chr11 45901572 + +1 C Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1633 chr11 48242562 * -3 TTC Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1634 chr11 57228723 * +4 CTTT Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1635 chr11 57228734 * +4 CTTT Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1636 chr11 57555685 * -1 C Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1637 chr11 61482191 - +3 TCC Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1638 chr11 64608031 * -1 T Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1639 chr11 64654936 * +1 C Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1640 chr11 71188303 + -1 T Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1641 chr11 75741034 + +1 T Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1642 chr11 76632438 * +1 A Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1643 chr11 89578266 * -2 AG Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1644 chr11 104383261 * +1 T Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1645 chr11 124125940 + +4 CCCC Hete
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1646 chr12 7760052 * +1 T Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1647 chr12 8266049 * +3 ACG Homo
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1648
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1649 I do not see a documentation describing this format yet as of September 2010.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1650
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1651 =item B<--SOAPsv format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1652
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1653 An example is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1654
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1655 Chr2 Deletion 42894 43832 43167 43555 388 0-0-0 FR 41
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1656
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1657 An explanation of the structural variation format is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1658
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1659 Format description (from left to right)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1660 1. Chromosome name
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1661 2. Type of structure variation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1662 3. Minimal value of start position in cluster
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1663 4. Maximal value of end position in cluster
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1664 5. Estimated start position of this structure variation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1665 6. Estimated end position of this structure variation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1666 7. Length of SV
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1667 8. Breakpoint of SV (only for insertion)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1668 9. Unusual matching mode (F refers to align with forward sequence, R refers
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1669 to align with reverse
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1670 sequence)
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1671 10. number of paired-end read which support this structure variation
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1672
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1673 =item * B<MAQ format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1674
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1675 MAQ can perform alignment and generate genotype calls, including SNP calls and
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1676 indel calls. The format is described below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1677
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1678 For indel header: The output is TAB delimited with each line consisting of chromosome, start
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1679 position, type of the indel, number of reads across the indel, size of the indel
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1680 and inserted/deleted nucleotides (separated by colon), number of indels on the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1681 reverse strand, number of indels on the forward strand, 5' sequence ahead of the
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1682 indel, 3' sequence following the indel, number of reads aligned without indels
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1683 and three additional columns for filters.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1684
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1685 An example is below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1686
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1687 chr10 110583 - 2 -2:AG 0 1 GCGAGACTCAGTATCAAAAAAAAAAAAAAAAA AGAAAGAAAGAAAAAGAAAAAAATAGAAAGAA 1 @2, @72, @0,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1688 chr10 120134 - 8 -2:CA 0 1 CTCTTGCCCGCTCACACATGTACACACACGCG CACACACACACACACACATCAGCTACCTACCT 7 @65,62,61,61,45,22,7, @9,12,13,13,29,52,67, @0,0,0,0,0,0,0,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1689 chr10 129630 - 1 -1:T 1 0 ATGTTGTGACTCTTAATGGATAAGTTCAGTCA TTTTTTTTTAGCTTTTAACCGGACAAAAAAAG 0 @ @ @
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1690 chr10 150209 - 1 4:TTCC 1 0 GCATATAGGGATGGGCACTTTACCTTTCTTTT TTCCTTCCTTCCTTCCTTCCCTTTCCTTTCCT 0 @ @ @
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1691 chr10 150244 - 2 -4:TTCT 0 1 CTTCCTTCCTTCCTTCCCTTTCCTTTCCTTTC TTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTT 0 @ @ @
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1692 chr10 159622 - 1 3:AGG 0 1 GAAGGAGGAAGGACGGAAGGAGGAAGGAAGGA AGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA 0 @ @ @
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1693 chr10 206372 - 2 2:GT 1 0 ATAATAGTAACTGTGTATTTGATTATGTGTGC GTGTGTGTGTGTGTGTGTGTGTGTGCGTGCTT 1 @37, @37, @8,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1694 chr10 245751 - 11 -1:C 0 1 CTCATAAATACAAGTCATAATGAAAGAAATTA CCACCATTTTCTTATTTTCATTCATTTTTAGT 10 @69,64,53,41,30,25,22,14,5,4, @5,10,21,33,44,49,52,60,69,70, @0,0,0,0,0,0,0,0,0,0,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1695 chr10 253066 - 1 2:TT 0 1 TATTGATGAGGGTGGATTATACTTTAGAACAC TATTCAAACAGTTCTTCCACATATCTCCCTTT 0 @ @ @
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1696 chr10 253455 - 2 -3:AAA 1 0 GTTGCACTCCAGCCTGGCGAGATTCTGTCTCC AAAAAAAAAAAAAAAAATTGTTGTGAAATACA 1 @55, @19, @4,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1697
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1698 For snp output file: Each line consists of chromosome, position, reference base,
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1699 consensus base, Phred-like consensus quality, read depth, the average number of
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1700 hits of reads covering this position, the highest mapping quality of the reads
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1701 covering the position, the minimum consensus quality in the 3bp flanking regions
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1702 at each side of the site (6bp in total), the second best call, log likelihood
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1703 ratio of the second best and the third best call, and the third best call.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1704
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1705 An example is below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1706
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1707 chr10 83603 C T 28 12 2.81 63 34 Y 26 C
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1708 chr10 83945 G R 59 61 4.75 63 62 A 47 G
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1709 chr10 83978 G R 47 40 3.31 63 62 A 21 G
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1710 chr10 84026 G R 89 22 2.44 63 62 G 49 A
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1711 chr10 84545 C T 54 9 1.69 63 30 N 135 N
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1712 chr10 85074 G A 42 5 1.19 63 38 N 108 N
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1713 chr10 85226 A T 42 5 1.00 63 42 N 107 N
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1714 chr10 85229 C T 42 5 1.00 63 42 N 112 N
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1715 chr10 87518 A G 39 4 3.25 63 38 N 9 N
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1716 chr10 116402 T C 39 4 1.00 63 38 N 76 N
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1717
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1718
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1719 =item * B<CASAVA format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1720
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1721 An example of Illumina CASAVA format is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1722
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1723 #position A C G T modified_call total used score reference type
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1724 14930 3 0 8 0 GA 11 11 29.10:11.10 A SNP_het2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1725 14933 4 0 7 0 GA 11 11 23.50:13.70 G SNP_het1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1726 14976 3 0 8 0 GA 11 11 24.09:9.10 G SNP_het1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1727 15118 2 1 4 0 GA 8 7 10.84:6.30 A SNP_het2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1728
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1729 An example of the indels is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1730
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1731 # ** CASAVA depth-filtered indel calls **
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1732 #$ CMDLINE /illumina/pipeline/install/CASAVA_v1.7.0/libexec/CASAVA-1.7.0/filterIndelCalls.pl--meanReadDepth=2.60395068970547 --indelsCovCutoff=-1 --chrom=chr1.fa /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0000.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0001.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0002.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0003.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0004.txt
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1733 #$ CHROMOSOME chr1.fa
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1734 #$ MAX_DEPTH undefined
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1735 #
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1736 #$ COLUMNS pos CIGAR ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) max2_gtype bp1_reads ref_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1737 948847 1I CCTCAGGCTT -/A ATAATAGGGC 969 hom 47 het 22 0 16 6 A 1 2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1738 978604 2D CACTGAGCCC CT/-- GTGTCCTTCC 251 hom 20 het 8 0 4 4 CT 1 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1739 1276974 4I CCTCATGCAG ----/ACAC ACACATGCAC 838 hom 39 het 18 0 14 4 AC 2 4
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1740 1289368 2D AGCCCGGGAC TG/-- GGAGCCGCGC 1376 hom 83 het 33 0 25 9 TG 1 0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1741
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1742 =item * B<VCF4 format>
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1743
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1744 VCF4 can be used to describe both population-level variation information, or for
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1745 reads derived from a single individual.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1746
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1747 One example of the indel format for one individual is given below:
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1748
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1749 ##fileformat=VCFv4.0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1750 ##IGv2_bam_file_used=MIAPACA2.alnReAln.bam
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1751 ##INFO=<ID=AC,Number=2,Type=Integer,Description="# of reads supporting consensus indel/any indel at the site">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1752 ##INFO=<ID=DP,Number=1,Type=Integer,Description="total coverage at the site">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1753 ##INFO=<ID=MM,Number=2,Type=Float,Description="average # of mismatches per consensus indel-supporting read/per reference-supporting read">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1754 ##INFO=<ID=MQ,Number=2,Type=Float,Description="average mapping quality of consensus indel-supporting reads/reference-supporting reads">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1755 ##INFO=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1756 ##INFO=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1757 ##INFO=<ID=SC,Number=4,Type=Integer,Description="strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads">
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1758 ##IndelGenotyperV2=""
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1759 ##reference=hg18.fa
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1760 ##source=IndelGenotyperV2
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1761 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Miapaca_trimmed_sorted.bam
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1762 chr1 439 . AC A . PASS AC=5,5;DP=7;MM=7.0,3.0;MQ=23.4,1.0;NQSBQ=23.98,25.5;NQSMM=0.04,0.0;SC=2,3,0,2 GT 1/0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1763 chr1 714048 . T TCAAC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.266666,21.932203;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1764 chr1 714049 . G GC . PASS AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.233334,21.83051;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT 0/1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1765 chr1 813675 . A AATAG . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=25.74,25.166666;NQSMM=0.0,0.033333335;SC=4,1,1,2 GT 0/1
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1766 chr1 813687 . AGAGAGAGAGAAG A . PASS AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=24.54,25.2;NQSMM=0.02,0.06666667;SC=4,1,1,2 GT 1/0
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1767
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1768
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1769 =back
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1770
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1771 The code was written by Dr. Kai Wang and modified by Dr. Germán Gastón Leparc.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1772 Various users have provided sample input files for many SNP callin software, for
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1773 the development of conversion subroutines. We thank these users for their
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1774 continued support to improve the functionality of the script.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1775
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1776 For questions or comments, please contact kai@openbioinformatics.org.
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1777
f753b30013e6 Uploaded
rdaveau
parents:
diff changeset
1778 =cut