gfap: gfapts/inc/annovar/convert2annovar.pl comparison

comparison gfapts/inc/annovar/convert2annovar.pl @ 0:f753b30013e6 draft

Uploaded

author	rdaveau
date	Fri, 29 Jun 2012 10:20:55 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:f753b30013e6
+#!/usr/bin/perl
+use warnings;
+use strict;
+use Getopt::Long;
+use Pod::Usage;
+our $VERSION = 			'$Revision: 466 $';
+our $LAST_CHANGED_DATE =	'$LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $';
+our ($verbose, $help, $man);
+our ($variantfile);
+our ($outfile, $format, $includeinfo, $snpqual, $snppvalue, $coverage, $maxcoverage, $chr, $chrmt, $altcov, $fraction, $species, $filterword, $confraction, $allallele);
+our %iupac = (R=>'AG', Y=>'CT', S=>'GC', W=>'AT', K=>'GT', M=>'AC', A=>'AA', C=>'CC', G=>'GG', T=>'TT', B=>'CGT', D=>'AGT', H=>'ACT', V=>'ACG', N=>'ACGT', '.'=>'-', '-'=>'-');
+GetOptions('verbose'=>\$verbose, 'help|h'=>\$help, 'man'=>\$man, 'outfile=s'=>\$outfile, 'format=s'=>\$format, 'includeinfo'=>\$includeinfo,
+	'snpqual=f'=>\$snpqual, 'snppvalue=f'=>\$snppvalue, 'coverage=i'=>\$coverage, 'maxcoverage=i'=>\$maxcoverage, 'chr=s'=>\$chr, 'chrmt=s'=>\$chrmt,
+	'fraction=f'=>\$fraction, 'altcov=i'=>\$altcov,
+	'species'=>\$species, 'filter=s'=>\$filterword, 'confraction=f'=>\$confraction, 'allallele!'=>\$allallele) or pod2usage ();
+$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
+$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
+@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
+@ARGV == 1 or pod2usage ("Syntax error");
+($variantfile) = @ARGV;
+$chrmt ||= 'M';
+if (not $format) {
+	$format = 'pileup';
+	print STDERR "NOTICE: the default --format argument is set as 'pileup'\n";
+}
+if (defined $outfile) {
+	open (STDOUT, ">$outfile") or die "Error: cannot write to output file $outfile: $!\n";
+}
+defined $snpqual and $format eq 'pileup' || $format eq 'vcf4' || pod2usage ("Error in argument: the --snpqual is supported only for the 'pileup' or 'vcf4' format");
+defined $snppvalue and $format eq 'gff3-solid' || pod2usage ("Error in argument: the --snppvalue is supported only for the 'gff3-solid' format");
+if (not defined $snpqual and $format eq 'pileup') {
+	$snpqual = 20;
+	print STDERR "NOTICE: the default --snpqual argument for pileup format is set as 20\n";
+}
+if (not defined $snppvalue) {
+	$snppvalue = 1;		#by default, do not use any of the P-value cutoff in filtering out GFF3-SOLID files (this is differnt from handling pileup files)
+}
+if (not defined $coverage) {
+	$coverage = 0;
+}
+if (defined $fraction) {
+	$format eq 'pileup' or $format eq 'vcf4' or pod2usage ("Error in argument: the '--fraction' argument is supported for the pileup or vcf4 format only");
+	$format eq 'vcf4' and print STDERR "NOTICE: the --fraction argument works ONLY on indels for vcf4 format\n";
+	$fraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --fraction argument must be between 0 and 1 inclusive");
+} else {
+	$fraction = 0;
+}
+if (defined $confraction) {
+	$format eq 'vcf4' and print STDERR "NOTICE: the --confraction argument works ONLY on indels for vcf4 format\n";
+	$confraction >= 0 and $fraction <=1 or pod2suage ("Error in argument: the --confraction argument must be between 0 and 1 inclusive");
+} else {
+	$confraction = 0;
+}
+if (defined $altcov) {
+	$format eq 'pileup' or pod2usage ("Error in argument: the '--altcov' argument is supported for the '--format pileup' only");
+	$altcov < $coverage or pod2suage ("Error in argument: the --altcov argument must be less than --coverage");
+	$altcov > 0 or pod2suage ("Error in argument: the --altcov argument must be a positive integer");
+}
+if (defined $species) {
+	$format eq 'gff3-solid' or pod2usage ("Error in argument: the '--species' argument is only necessary for the '--format gff3-solid'");
+}
+if ($allallele) {
+	$format eq 'vcf4' or pod2usage ("Error in argument: the '--allallele' argument is only supported for the '--format vcf4'");
+}
+if ($format eq 'pileup') {
+	convertPileup ($variantfile);
+} elsif ($format eq 'cg') {
+	convertCG ($variantfile);
+} elsif ($format eq 'gff3-solid') {
+	convertGFF3SolidSNP ($variantfile);
+} elsif ($format eq 'soap') {
+	print STDERR "WARNING: the support for '--format soap' is not well developed yet and may contain bugs for indel analysis.\n";
+	convertSOAP ($variantfile);
+} elsif ($format eq 'maq') {
+	print STDERR "WARNING: the support for '--format maq' is not well developed yet and may contain bugs.\n";
+	convertMAQSNP ($variantfile);
+} elsif ($format eq 'casava') {
+	if (not defined $chr) {
+		pod2usage ("Error in argument: please supply --chr argument for the '--format casava'");
+	}
+	convertCASAVA ($variantfile, $chr);
+} elsif ($format eq 'vcf4') {
+	convertVCF4 ($variantfile);
+} elsif ($format eq 'annovar') {
+	convertANNOVAR ($variantfile);
+} else {
+	pod2usage ("Error in argument: the --format $format is not currently supported. Please contact ANNOVAR developer for adding the support");
+}
+sub convertPileup {
+	my ($variantfile) = @_;
+	my ($countline, $countvar, $counthom, $counthet, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0/;
+	if ($variantfile eq 'stdin') {
+		*VAR = *STDIN;
+	} else {
+		open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	}
+	print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, SNP quality, total reads, reads with mutation\n";
+	while (<VAR>) {
+		s/[\r\n]+$//;
+		$countline++;
+		my $hom = 'hom';
+		my @field = split (/\t/, $_);
+		@field >= 10 or die "Error: invalid record found in pileupfile $variantfile (at least 10 fields expected): <$_>\n";
+		my ($chr, $pos, $wt, $call, @other) = @field;
+		my ($cons_qual, $snp_quality, $readcount, $readallele) = @field[4,5,7,8];
+		$chr =~ s/^chr//;
+		$wt = uc $wt;					#wt may or may not in upper case, it depends on the input FASTA file
+		$call = uc $call;				#usually call is in upper case
+		$readallele = uc $readallele;			#lower case indicate the opposite strand
+		$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
+		$snp_quality >= $snpqual or next;		#quality of the variant call
+		$readcount >= $coverage or next;		#read count of the variant call
+		$maxcoverage and $readcount <= $maxcoverage || next;	#maximum coverage of the variant call
+		if ($wt eq '*') {				#indel
+			#example:
+			#1       970271  *       +C/+C   39      106     44      5       +C      *       1       4       0       0       0
+			#1       1548977 *       */+CCG  29      29      42      3       *       +CCG    2       1       0       0       0
+			#1       1674810 *       */+C    24      24      42      6       *       +C      5       1       0       0       0
+			#1       968466  *       -CT/-CT 53      339     55      5       -CT     *       5       0       0       0       0
+			#1       1093600 *       -GAAA/* 29      29      53      3       -GAAA   *       1       2       0       0       0
+			#1       1110101 *       */-A    41      41      17      6       *       -A      5       1       0       0       0
+			#1       1215395 *       */-TC   26      26      32      4       *       -TC     3       1       0       0       0
+			my @obs = split (/\//, $call);		#example: '+AG/+AG' as homozygotes, '*/-TA' or '*/+T' as heterozygotes
+			@obs == 2 or die "Error: pileup record contains more than two alternative alleles: <$_>\n";
+			my ($end, $ref, $alt);
+			my ($indelreadcount);			#number of reads supporting the indel
+			if ($obs[0] eq $obs[1]) {
+				#something weird may occur in SamTools output: 22      15231121        *       */*     360     0       32      158     *       +GA     156     2       0       0       0
+				$obs[0] eq '*' and next;
+				#for deletions, SAMTOOLS represent deletion using a location before the first deleted base in the reference sequence coordinate system
+				#for example, a deletion in Samtools output is "1       109266688       *       */-CTT  1429    1429    58      43      *       -CTT    24      19      0       0       0"
+				#the correct ANNOVAR input (for rs35029887) should be "1       109266689       109266691       CTT     -       het     1429"
+				#insertions are fine without change; for example, check rs5745466 in Genome Browser; samtools report "1       76119508        *       +AT/+AT"
+				#for this insertion, ANNOVAR input file (for rs5745466) becomes "1       76119508        76119508        -       AT      hom     1601"
+				if ($obs[0] =~ m/^\-/) {
+					$pos++;			#add 1 to position in deletion
+				}
+				$indelreadcount = calculateIndelReadCount ($obs[0], \@field);
+				$indelreadcount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				defined $altcov and $indelreadcount >= $altcov || next;
+				($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]);
+				print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
+				$counthom++;
+			} else {
+				$hom = 'het';
+				if ($obs[0] =~ m/^[\-\+]/) {
+					$obs[0] =~ m/^\-/ and $pos++;
+					($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[0]);
+					$indelreadcount = calculateIndelReadCount ($obs[0], \@field);
+					$indelreadcount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+					defined $altcov and $indelreadcount >= $altcov || next;
+					print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
+					$counthet++;
+				}
+				if ($obs[1] =~ m/^[\-\+]/) {
+					$obs[1] =~ m/^\-/ and $pos++;
+					($end, $ref, $alt) = recalculateEndRefObs ($pos, $wt, $obs[1]);
+					$indelreadcount = calculateIndelReadCount ($obs[1], \@field);
+					$indelreadcount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+					defined $altcov and $indelreadcount >= $altcov || next;
+					print STDOUT join ("\t", $chr, $pos, $end, $ref, $alt, $hom, $snp_quality, $readcount, $indelreadcount, @other), "\n";
+					$counthet++;
+				}
+			}
+			$countindel++;
+		} else {
+			#1       798494  G       A       36      36      58      3       AAA     bbb
+			#1       798677  T       K       33      33      52      26      ,$.,,G.GG,.,......,..G,,...     b`bbBaJIbFbZWaTNQbb_VZcbbb
+			#1       856182  G       A       42      42      50      5       AAAAA   B\bbb
+			#1       861034  A       M       48      48      49      14      ,$,.,..,cc.c.,c bbBbb`]BFbHbBB
+			#1       864289  T       K       22      22      56      6       .g,,g,  BH^_BB
+			$wt eq $call and next;			#this is not a SNP
+			my $obs = $iupac{$call} or die "Error: invalid best call ($call) in <$_>\n";
+			my @obs = split (//, $obs);
+			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
+			if ($obs[0] ne $obs[1]) {
+				$hom = 'het';
+			}
+			if ($obs[0] eq $wt) {			#obs[0] is guaranteed to be an alternative allele
+				@obs = @obs[1,0];
+			}
+			if ($wt eq 'A' and $obs[0] eq 'G' or $wt eq 'G' and $obs[0] eq 'A' or $wt eq 'C' and $obs[0] eq 'T' or $wt eq 'T' and $obs[0] eq 'C') {
+				unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) {
+					$countti++;
+				}
+			} else {
+				unless ($wt ne $obs[0] and $wt ne $obs[1] and $obs[0] ne $obs[1]) {
+					$counttv++;
+				}
+			}
+			my $mutallelecount;
+			if ($obs[1] eq $wt) {			#het SNP
+				if ($chr eq $chrmt) {
+					$hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
+				}
+				$mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
+				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				defined $altcov and $mutallelecount >= $altcov || next;
+				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
+				$counthet++;
+			} elsif ($obs[1] ne $obs[0]) {		#het SNP but both differ from reference allele
+				if ($chr eq $chrmt) {
+					$hom = calculateAllelicFraction ($obs[1], $field[8], $readcount);
+				}
+				$mutallelecount = calculateMutAlleleCount ($obs[1], $readallele);
+				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				defined $altcov and $mutallelecount >= $altcov || next;
+				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[1], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
+				if ($chr eq $chrmt) {
+					$hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
+				}
+				$mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
+				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				defined $altcov and $mutallelecount >= $altcov || next;
+				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
+				$counthet++;
+				$counthet++;
+			} else {				#homo SNP
+				if ($chr eq $chrmt) {
+					$hom = calculateAllelicFraction ($obs[0], $field[8], $readcount);
+				}
+				$mutallelecount = calculateMutAlleleCount ($obs[0], $readallele);
+				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				defined $altcov and $mutallelecount >= $altcov || next;
+				print STDOUT join ("\t", $chr, $pos, $pos, $wt, $obs[0], $hom, $snp_quality, $readcount, $mutallelecount, @other), "\n";
+				$counthom++;
+			}
+			$countsnp++;
+		}
+		$countvar++;
+	}
+	my $triallelic = $countsnp-$countti-$counttv;
+	print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
+	print STDERR "NOTICE: Among ${\($counthet+$counthom)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
+	print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
+}
+sub calculateIndelReadCount {
+	my ($obs, $field) = @_;
+	#make sure to use upper case in the comparison, for example:
+	#chr10   130133  *       */-ca   189     189     59      31      *       -ca     27      4       0       0       0
+	if ($obs eq uc $field->[8]) {
+		return $field->[10];
+	} elsif ($obs eq uc $field->[9]) {
+		return $field->[11];
+	} else {
+		die "Error: invalid record in pileup file (indel counts cannot be inferred): <$obs> vs <@$field>\n";
+	}
+}
+sub calculateMutAlleleCount {
+	my ($allele, $string) = @_;	#they should be already both in upper case
+	$string =~ s/\^.//g;		#^ is followed by mapping quality
+	$string =~ s/\$//g;
+	$string =~ s/[+-]1[^\d]//g;	#1 followed by a non-number
+	$string =~ s/[+-]2..//g;
+	$string =~ s/[+-]3...//g;
+	$string =~ s/[+-]4....//g;
+	$string =~ s/[+-]5.....//g;
+	$string =~ s/[+-]6......//g;
+	$string =~ s/[+-]7.......//g;
+	$string =~ s/[+-]8........//g;
+	$string =~ s/[+-]9.........//g;
+	$string =~ s/[+-]10..........//g;
+	#make sure to use upper case letter
+	my @string = split (//, uc $string);
+	my $count = 0;
+	for my $i (0 .. @string-1) {
+		$allele eq $string[$i] and $count++;
+	}
+	return $count;
+}
+sub calculateAllelicFraction {
+	my ($obs, $readbase, $readcount) = @_;
+	my @readbase = split (//, $readbase);
+	my $count=0;
+	for my $i (0 .. @readbase-1) {
+		uc $obs eq uc $readbase[$i] and $count++;
+	}
+	my $hom = $count/$readcount;
+	length ($hom) > 5 and $hom > 0.001 and $hom = sprintf ("%.3f", $hom);
+	return $hom;
+}
+sub recalculateEndRefObs {		#recalculate end position, reference allele and observed allele
+	my ($end, $ref, $obs) = @_;
+	if ($obs =~ m/^\-(\w+)/) {	#deletion
+		$end += (length ($1)-1);
+		$ref = $1;
+		$obs = '-';
+	} elsif ($obs =~ m/^\+(\w+)/) {	#insertion
+		$ref = '-';
+		$obs = $1;
+	} else {
+		die "Error: cannot handle $end, $ref, $obs\n";
+	}
+	return ($end, $ref, $obs);
+}
+sub convertCG {
+	my ($variantfile) = @_;
+	my ($foundheader, $countline, @field);
+	my ($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	print STDERR "NOTICE: Converting variants from $variantfile\n";
+	while (<VAR>) {
+		s/[\r\n]+$//;
+		$countline++;
+		if (m/^>locus/) {
+			$foundheader++;
+		}
+		if (not $foundheader) {
+			$countline > 50 and die "Error: invalid CG-var file format for $variantfile (>locus record is not found within the first 50 lines)\n";
+			next;
+		}
+		my ($locus, $ploidy, $haplo, $chr, $start, $end, $vartype, $ref, $obs, $score, $haplolink, $xref) = split (/\t/, $_);
+		$chr =~ s/^chr//;
+		$vartype eq 'ins' or $start++;		#CG use zero-start, half-open coordinate. Insertion does not need to be processed (example, "16476   2       2       chr1    751820  751820  ins             T       49              dbsnp:rs59038458")
+		$obs eq '' and $obs = '-';
+		$ref eq '' and $ref = '-';
+		if ($vartype =~ m/^snp|ins|del|delins|sub$/) {		#in new versions of the files, "sub" is used instead of "delins".
+			#$chr eq 'M' and next;			#ignore chrM markers as they are not diploid
+			if ($chr eq $prechr and $start eq $prestart and $end eq $preend and $obs eq $preobs) {		#homozygous mutation
+				print $chr, "\t", $start, "\t", $end, "\t", $ref, "\t", $obs, "\t", $vartype, "\t", ($score+$prescore)/2, "\t", "hom\t", $xref, "\n";
+				($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = qw/0 0 0 0 0 0 0 0/;
+			} else {
+				if ($prestart and $preend) {
+					print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
+				}
+				($prechr, $prestart, $preend, $prevartype, $preref, $preobs, $prescore, $prexref) = ($chr, $start, $end, $vartype, $ref, $obs, $score, $xref);
+			}
+		}
+	}
+	if ($prestart and $preend) {
+		print $prechr, "\t", $prestart, "\t", $preend, "\t", $preref, "\t", $preobs, "\t", $prevartype, "\t", $prescore, "\thet\t", $prexref, "\n";
+	}
+	print STDERR "NOTICE: Done with $countline lines\n";
+}
+sub convertGFF3SolidSNP {
+	my ($variantfile) = @_;
+	my ($countline, $countvar, $countallvar, @other) = (0, 0, 0);
+	my ($unknown_count);		#count of variants with 'unknown' variation type
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	$_ = <VAR>;
+	s/[\r\n]+$//;
+	m/^##gff-version\s+3/ or die "Error: invalid first line in GFF3 file ('##gff-version 3' expected): <$_>\n";
+	$_ = <VAR>;
+	s/[\r\n]+$//;
+	m/^##solid-gff-version/ or print STDERR "WARNING: problematic second line in GFF3-SOLiD file ('##solid-gff-version' expected): <$_>\n";
+	print STDERR "NOTICE: Column 6-9 in output are heterozygosity status, variant score (P-value), total clipped normal coverage reads, total reads with mutated allele\n";
+	while (<VAR>) {
+		s/[\r\n]+$//;
+		$countline++;
+		m/^##/ and next;		#header of comment lines
+		m/^#/ and next;			#header of results lines
+		my @field = split (/\t/, $_);
+		@field == 9 or die "Error: invalid record found in $variantfile (10 fields expected): <$_>\n";
+		my ($chr, $program, $type, $pos, $end, $score, $attribute) = @field[0,1,2,3,4,5,8];		#score is the P-value for the SNP calls
+		$chr eq 'chr_name' and next;	#header line
+		if ($score ne '.') {
+			$score >=0 and $score <=1 or die "Error: invalid score record found in file (0-1 range expected): <$_>\n";
+			$score <= $snppvalue or next;
+		}
+		if ($species and $species eq 'human') {
+			$chr eq '23' and $chr = 'X';
+			$chr eq '24' and $chr = 'Y';
+			$chr eq '25' and $chr = 'M';
+		}
+		$includeinfo and @other = ($attribute);			#unless -includeinfo is set, the other will not be printed
+		my ($readcount, $mutallelecount) = ('.', '.');		#total coverage, coverage for mutated alleles
+		if ($type eq 'unknown') {
+			#SOLiD GDD3 may have unknown variation types
+			#chr1    AB_SOLiD Small Indel Tool       unknown 3833062 3833153 1       .       .       ID=5483;len=no_call;allele-call-pos=3833062;allele-call=/CCAC;allele-pos=3833057;alleles=atccatccacccatc/aTCCATCCACCCACCCATC/NO_CALL;allele-counts=REF,2,2;tight_chrom_pos=none;loose_chrom_pos=3833058-3833069;no_nonred_reads=3;coverage_ratio=8.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_50_10_r,L1_1_50_15_r,L1_1_50_15_r,L1_1_50_12_r;bead_ids=1018_196_970,699_1263_465,220_513_923,2022_1532_1071;overall_qvs=4,6,2,50;no_mismatches=5,4,2,0;read_pos=27,29,31,13;from_end_pos=23,21,19,37;strands=+,+,+,+;tags=R3,F3,F3,F3;indel_sizes=-92,-112,4,4;non_indel_no_mismatches=0,0,8,0;unmatched-lengths=50,50,50,50;ave-unmatched=50.0000;anchor-match-lengths=48,49,49,49;ave-anchor-length=48.7500;read_seqs=G23223321322112233223100132013201320110011001322332,T33223321322112233223100132013201320110013021322332,T33223321322112233223100132013201320110011001322332,T31001320132013201100110013223322113030332233113032;base_qvs=;non_indel_seqs=T21322332211221121322332230321212121223322332233221,G12020202202020012001200213022002130012332310122030,G12020202202020012001000210022012110312331331122030,G22111012101031010100002002321020002202121121313021;non_indel_qvs=
+			$unknown_count++;
+			next;		#do not count this one!
+		}
+		if ($program eq 'SOLiD_diBayes' or $program eq 'AB_SOLiD SNP caller') {		#SNP variants
+			#detailed description can be found at http://solidsoftwaretools.com/gf/download/docmanfileversion/208/866/DiBayes_Documentation_v1.2.pdf
+			#chr1    SOLiD_diBayes   SNP     559817  559817  0.094413        .       .       genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag=
+			#chr1    SOLiD_diBayes   SNP     714068  714068  0.000000        .       .       genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag=
+			#chr1    SOLiD_diBayes   SNP     714835  714835  0.041579        .       .       genotype=R;reference=A;coverage=5;refAlleleCounts=3;refAlleleStarts=3;refAlleleMeanQV=18;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=20;diColor1=02;diColor2=20;het=1;flag=
+			$pos == $end or die "Error: invalid record found in GFF3-SOLiD file: start and end discordant: <$_>\n";
+			my ($wt, $call);
+			if ($attribute =~ m/ref_base=(\w)/) {
+				$wt = $1;
+			} elsif ($attribute =~ m/reference=(\w)/) {
+				$wt = $1;
+			} else {
+				die "Error: invalid record found in GFF3-SOLiD file (ref_base/reference was not found): <$_>\n";
+			}
+			if ($attribute =~ m/consen_base=(\w)/) {
+				$call = $1;
+			} elsif ($attribute =~ m/genotype=(\w)/) {
+				$call = $1;
+			} else {
+				die "Error: invalid record found in GFF3-SOLiD file (consen_base was not found): <$_>\n";
+			}
+			if ($attribute =~ m/coverage=(\d+)/) {
+				$readcount = $1;
+				$readcount >= $coverage or next;		#read count of the variant call
+				$maxcoverage and $readcount <= $maxcoverage || next;
+			}
+			if ($attribute =~ m/novelAlleleCounts=(\d+)/) {
+				$mutallelecount = $1;
+				$mutallelecount/$readcount >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				defined $altcov and $mutallelecount >= $altcov || next;
+			}
+			my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
+			my @obs = split (//, $obs);
+			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
+			if ($obs[0] eq $wt and $obs[1] eq $wt) {
+				die "Error: reference alleles are identical to observed alleles: <$_>\n";
+			} elsif ($obs[0] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] ne $obs[0]) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+				$countallvar++;
+			} else {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+			}
+		} elsif ($program eq 'AB_CNV_PIPELINE') {	#CNV
+			if ($attribute =~ m/copynum=(\d+)/ or $attribute =~ m/copynumber=(\d+)/) {
+				if ($1 < 2) {
+					print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
+				} elsif ($1 > 2) {
+					print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
+				}
+			} else {
+				print $chr, "\t", $end, "\t", $end, "\t", '-', "\t", 0, "\t", "unk\t", "$score\t.\t.\t", join ("\t", @other), "\n";
+			}
+		} elsif ($program eq 'AB_SOLiD Large Indel Tool') {	#CNV
+			#http://solidsoftwaretools.com/gf/download/docmanfileversion/182/780/Large_Indel_Documentation_v1.0.0.pdf
+			## [FIELDS] (1) chromosome (2) version (3) indel type (4) breakpoint start (5) breakpoint end (6) p-value (7) NA (8) NA (9) attributes
+			#chr10   AB_SOLiD Large Indel Tool       insertion       151910  151969  2.77548e-11     .       .       dev=-71;avgDev=-1.63884;zygosity=HOMOZYGOUS;nRef=0;nDev=14;refDev=0;devDev=-1.60924;refVar=0;devVar=0.0159438;beadIds=1750_720_1641,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279,
+			#chr10   AB_SOLiD Large Indel Tool       insertion       154109  154729  2.1559e-11      .       .       dev=-66;avgDev=-1.51253;zygosity=HOMOZYGOUS;nRef=0;nDev=15;refDev=0;devDev=-1.02864;refVar=0;devVar=0.133236;beadIds=1728_1671_1739,1250_231_25,811_783_1090,1035_908_491,649_1680_794,503_1756_608,1726_174_1362,1208_1772_353,872_594_1604,1924_990_858,1899_961_1848,901_1226_378,323_1750_1017,1185_180_1237,1519_490_1074,1291_94_324,285_758_922,1135_95_1594,1055_218_1279,
+			my ($call, @call, $zygosity);
+			if ($attribute =~ m#zygosity=HEMIZYGOUS#) {
+				$zygosity = 'het';
+			} elsif ($attribute =~ m#zygosity=HOMOZYGOUS#) {
+				$zygosity = 'hom';
+			} else {
+				$zygosity = 'unk';
+			}
+			if ($type eq 'insertion') {
+				#the true boundary is unknown (start is different from end) so we cannot use "-" to represent reference allele.
+				print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", 0, "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n";
+			} elsif ($type eq 'deletion') {
+				print $chr, "\t", $pos, "\t", $end, "\t", 0, "\t", '-', "\t", $zygosity, "\t", "$score\t.\t.\t", join ("\t", @other), "\n";
+			}
+		} elsif ($program eq 'AB_SOLiD Small Indel Tool') {		#small indels
+			#typical simple insertion and deletions
+			#chr1    AB_SOLiD Small Indel Tool       deletion        1352612 1352614 1       .       .       ID=1290;del_len=3;allele-call-pos=1352612;allele-call=cct/;allele-pos=1352610;alleles=cccctccat/cCCCAT;allele-counts=REF,2;tight_chrom_pos=1352612-1352614;loose_chrom_pos=1352612-1352614;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_3_r,L1_1_25_8_r;bead_ids=1470_2000_506,822_1710_1767;overall_qvs=18,19;no_mismatches=3,3;read_pos=6,13;from_end_pos=19,12;strands=-,+;tags=R3,R3;indel_sizes=-3,-3;non_indel_no_mismatches=1,-1;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=24,99;ave-anchor-length=61.5000;read_seqs=G0112310001100003120031200,G0300213000011000132110021;base_qvs=;non_indel_seqs=T2120033002022200220000002,;non_indel_qvs=
+			#chr1    AB_SOLiD Small Indel Tool       insertion_site  1311162 1311162 1       .       .       ID=1249;ins_len=1;allele-call-pos=1311162;allele-call=/G;allele-pos=1311161;alleles=gaggggggg/GAGGGGGGGG/NO_CALL;allele-counts=REF,3,1;tight_chrom_pos=none;loose_chrom_pos=1311160-1311169;no_nonred_reads=3;coverage_ratio=4.6667;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_50_10_r,L1_1_25_2_r,L1_1_25_3_r;bead_ids=850_837_429,1160_878_181,404_1050_1881,1084_64_1343;overall_qvs=20,56,25,25;no_mismatches=3,2,2,1;read_pos=11,22,11,11;from_end_pos=14,28,14,14;strands=+,-,-,-;tags=R3,F3,F3,F3;indel_sizes=1,1,1,1;non_indel_no_mismatches=-1,1,0,1;unmatched-lengths=25,50,25,25;ave-unmatched=31.2500;anchor-match-lengths=99,49,24,24;ave-anchor-length=49.0000;read_seqs=G1020001130221020000000020,T03223323210110021000000022122030100020221222222122,T0102210000000221223301000,T0102210000000221220301000;base_qvs=;non_indel_seqs=,G21202030032202013220021321131212021000122300013132,G1331133120001221220120120,G1331133120001221220120220;non_indel_qvs=
+			#sometimes, allele-call is ambiguous that requires a "block substitution" representation (although they were annotated as insertion or deletion by SOLiD, they should be treated as block substitution by ANNOVAR)
+			#sometimes, mutiple allele calls may be present at the same site
+			#chr1    AB_SOLiD Small Indel Tool       deletion        1209357 1209360 1       .       .       ID=1101;del_len=4;allele-call-pos=1209357;allele-call=ggtggg/TT;allele-pos=1209355;alleles=ggggtgggggggtt/gGTTGGGGTT/gGTGTTTTGCCTT/NO_CALL;allele-counts=REF,3,1,1;tight_chrom_pos=none;loose_chrom_pos=1209357-1209363;no_nonred_reads=4;coverage_ratio=3.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=0.9888;run_names=L1_1_25_1_r,L1_1_25_2_r,L1_1_25_4_r,L1_1_25_3_r,L1_1_25_7_r;bead_ids=1017_1024_53,1493_1896_615,1794_647_1473,307_1116_687,773_1492_1671;overall_qvs=24,24,28,24,8;no_mismatches=2,3,2,3,2;read_pos=14,9,14,9,15;from_end_pos=11,16,11,16,10;strands=-,+,-,+,+;tags=F3,R3,F3,F3,F3;indel_sizes=-4,-4,-4,-4,3;non_indel_no_mismatches=1,0,0,0,0;unmatched-lengths=25,25,25,25,25;ave-unmatched=25.0000;anchor-match-lengths=24,24,24,24,24;ave-anchor-length=24.0000;read_seqs=T2221100101000101000221100,G0001122000100000101001020,T2221100101000101000221100,T1112200010100010100112000,T1011220000111000130200001;base_qvs=;non_indel_seqs=G0312033221312111022200300,T0111113210210112100001130,G0312133221312111022200300,G0231003132222112000012020,G3121331033101113122312020;non_indel_qvs=
+			#chr1    AB_SOLiD Small Indel Tool       deletion        1209436 1209436 1       .       .       ID=1103;del_len=1;allele-call-pos=1209436;allele-call=ag/A/G;allele-pos=1209434;alleles=tgagggggtt/tGAGGGGTT/tGGGGGGTT;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1209436-1209441;no_nonred_reads=2;coverage_ratio=5.0000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_6_r,L1_1_25_2_r;bead_ids=1315_1584_2005,1706_194_437;overall_qvs=28,21;no_mismatches=0,3;read_pos=9,7;from_end_pos=16,18;strands=-,-;tags=R3,R3;indel_sizes=-1,-1;non_indel_no_mismatches=-1,0;unmatched-lengths=25,25;ave-unmatched=25.0000;anchor-match-lengths=99,24;ave-anchor-length=61.5000;read_seqs=G3001010000011001010000001,G3010100022110010111000110;base_qvs=;non_indel_seqs=,T1112003220020013202122300;non_indel_qvs=
+			#chr1    AB_SOLiD Small Indel Tool       insertion_site  1424540 1424540 1       .       .       ID=1376;ins_len=3;allele-call-pos=1424540;allele-call=tt/CCCAC;allele-pos=1424537;alleles=ttttttg/TTTCCCACTG/NO_CALL;allele-counts=REF,1,1;tight_chrom_pos=none;loose_chrom_pos=1424536-1424543;no_nonred_reads=2;coverage_ratio=11.5000;experimental-zygosity=HEMIZYGOUS;experimental-zygosity-score=1.0000;run_names=L1_1_25_7_r,L1_1_50_16_r;bead_ids=703_437_370,1089_878_1744;overall_qvs=1,9;no_mismatches=3,4;read_pos=5,35;from_end_pos=20,15;strands=-,-;tags=R3,F3;indel_sizes=3,3;non_indel_no_mismatches=2,0;unmatched-lengths=25,50;ave-unmatched=37.5000;anchor-match-lengths=24,47;ave-anchor-length=35.5000;read_seqs=G2032002200200000000000020,T30100102220312202103112130230322210121100200002100;base_qvs=;non_indel_seqs=T2121120003012303000000000,G22213300221101011121030022002222300220322213303102;non_indel_qvs=
+			my ($call, @call, $zygosity);
+			if ($attribute =~ m#experimental-zygosity=HEMIZYGOUS#) {
+				$zygosity = 'het';
+			} elsif ($attribute =~ m#experimental-zygosity=HOMOZYGOUS#) {
+				$zygosity = 'hom';
+			} else {
+				$zygosity = 'unk';
+			}
+			$score = '.';			#by default, score=1 in the output
+			#no_nonred_reads: Number of reads with unique start positions (non-redundant reads).
+			#coverage_ratio: Clipped normal coverage/number of non-redundant reads.Clipped coverage is where the parts of the read that are within 5 bp at either end are not counted as a part of coverage.
+			if ($attribute =~ m/no_nonred_reads=(\d+);coverage_ratio=([\d\.]+)/) {
+				$readcount = int ($1*$2);
+				$readcount >= $coverage or next;		#clipped normal read count of the variant call (this could even be lower than mut allele count)
+				$maxcoverage and $readcount <= $maxcoverage || next;
+			} else {
+				$readcount = '.';
+			}
+			if ($attribute =~ m/allele-counts=REF,(\d+)/) {
+				$mutallelecount = $1;
+			}
+			if ($attribute =~ m#allele\-call=([\w\/]+)#) {
+				@call = split (/\//, $1);
+				if (@call == 1) {		#a simple deletion
+					print $chr, "\t", $pos, "\t", $end, "\t", $call[0], "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+				} elsif ($call[0] eq '') {	#a simple insertion (single or multiple allele)
+					for my $i (1 .. @call-1) {
+						print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+						$i > 1 and $countallvar++;
+					}
+				} else {			#an indel that may have several alleles, or may require a block substitution representation
+					for my $i (1 .. @call-1) {
+						print $chr, "\t", $pos, "\t", $pos+length($call[0])-1, "\t", $call[0], "\t", $call[$i], "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+						$i > 1 and $countallvar++;
+					}
+				}
+			} else {
+				$call = '0';
+				print $chr, "\t", $pos, "\t", $end, "\t", $call, "\t", '-', "\t", $zygosity, "\t", "$score\t$readcount\t$mutallelecount\t", join ("\t", @other), "\n";
+			}
+		} else {
+			die "Error: unrecognizable genotype calling program encountered (valid types are SOLiD_diBayes, AB_CNV_PIPELINE, AB_SOLiD Large Indel Tool, AB_SOLiD Small Indel Tool): <$_>\n";
+		}
+		$countvar++;		#variation positions
+		$countallvar++;		#all variants (maybe several at one variation position)
+	}
+	print STDERR "NOTICE: Finished processing $variantfile with $countline input lines\n";
+	print STDERR "NOTICE: Wrote variants in $countvar variation positions ($countallvar variants considering multi-allelic ones)\n";
+	$unknown_count and print STDERR "WARNING: $unknown_count variants with 'unknown' variation type were skipped\n";
+}
+sub convertSOAP {
+	my ($variantfile) = @_;
+	my ($countline, $countvar, @other);
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	while (<VAR>) {
+		s/[\r\n]+$//;
+		$countline++;
+		my @field = split (/\t/, $_);
+		if (@field == 18) {		#snp file
+			my ($chr, $pos, $wt, $call, @other) = @field;
+			$chr =~ s/^chr//;
+			$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
+			my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
+			my @obs = split (//, $obs);
+			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
+			if ($obs[0] eq $wt and $obs[1] eq $wt) {
+				die "Error: reference alleles are identical to observed alleles: <$_>\n";
+			} elsif ($obs[0] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] ne $obs[0]) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+				$countvar++;
+			} else {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
+			}
+		} elsif (@field == 6) {		#indel file
+			my ($chr, $pos, $strand, $indellen, $call, $homo) = @field;
+			$homo eq 'Homo' and $homo = 'hom';
+			$homo eq 'Hete' and $homo = 'het';
+			$chr =~ s/^chr//;
+			$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
+			if ($indellen =~ m/^\+(\d+)$/) {		#insertion
+				length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
+				print join("\t", $chr, $pos, $pos, '-', $call, $homo), "\n";
+			} elsif ($indellen =~ m/^\-(\d+)$/) {		#deletion
+				length ($call) == $1 or die "Error: invalid record found in SOAPindel file: <$_>\n";
+				print join("\t", $chr, $pos, $pos+$1-1, $call, '-', $homo), "\n";
+			} else {
+				die "Error: invalid record found in SOAPindel file: <$_>\n";
+			}
+		} else {
+			die "Error: invalid record found in $variantfile (18 or 6 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
+		}
+		$countvar++;
+	}
+	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
+}
+sub convertANNOVAR {
+	my ($variantfile) = @_;
+	my ($countline, $countvar, $countsnp);
+	my ($countti, $counttv) = (0, 0);
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	while (<VAR>) {
+		$countline++;
+		my @field = split (/\t/, $_);
+		my ($chr, $start, $end, $ref, $obs) = @field;
+		if ($ref =~ m/^[ACGT]$/ and $obs =~ m/^[ACGT]$/) {
+			if ($ref eq 'A' and $obs eq 'G' or $ref eq 'G' or $obs eq 'A' or $ref eq 'C' and $obs eq 'T' or $ref eq 'T' and $obs eq 'C') {
+				$countti++;
+			} else {
+				$counttv++;
+			}
+			$countsnp++;
+		}
+		print;
+		$countvar++;
+	}
+	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
+	print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions\n";
+}
+sub convertMAQSNP {
+	my ($variantfile) = @_;
+	my ($countline, $countvar, @other);
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	while (<VAR>) {
+		s/[\r\n]+$//;
+		$countline++;
+		my @field = split (/\t/, $_);
+		if (@field == 12) {					#SNPs
+			my ($chr, $pos, $wt, $call, @other) = @field;
+			$chr =~ s/^chr//;
+			$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
+			my $obs = $iupac{$call} or die "Error: invalid best call in <$_>\n";
+			my @obs = split (//, $obs);
+			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
+			if ($obs[0] eq $wt and $obs[1] eq $wt) {
+				die "Error: reference alleles are identical to observed alleles: <$_>\n";
+			} elsif ($obs[0] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] ne $obs[0]) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+				$countvar++;
+			} else {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
+			}
+			$countvar++;
+		} elsif (@field == 13) {				#indels; the deletion start site do not need changes; the duplication start site need additional investigation by ANNOVAR developers
+			my ($chr, $pos, $type, $numread, $call, @other) = @field;
+			$chr =~ s/^chr//;
+			$includeinfo or @other = ();			#unless -includeinfo is set, the other will not be printed
+			my @obs = split (/:/, $call);
+			@obs == 2 or die "Error: observed IUPAC allele $call should correspond to two nucleotide alleles: <$_>\n";
+			if ($obs[0] =~ m/^\-(\d+)/) {
+				my $len = $1;
+				print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[1], "\t", '-', "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[0] =~ m/^(\d+)/) {
+				my $len = $1;
+				print $chr, "\t", $pos, "\t", $pos, "\t", '-', "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+			}
+			$countvar++;
+		} else {
+			die "Error: invalid record found in $variantfile (12 or 13 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
+		}
+	}
+	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
+}
+sub convertCASAVA {
+	my ($variantfile, $chr) = @_;
+	my ($countline, $countvar, @other);
+	my ($intype);
+	my ($pos_index, $call_index, $reference_index, $type_index, $score_index, $total_index, $used_index);
+	my ($ref_indel_index, $quality_index, $maxgtype_index, $bp1_reads_index, $ref_reads_index, $indel_reads_index, $other_reads_index);
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	while (<VAR>) {
+		s/[\r\n]+$//;
+		$countline++;
+		my @field;
+		if (m/^#/) {
+			s/^#//;
+			if (s/^\$\sCOLUMNS\s//) {
+				@field = split (/\s+/, $_);
+			} else {
+				@field = split (/\t/, $_);
+			}
+			if (m/\bposition\b/ or m/\bpos\b/) {
+				for my $i (0 .. @field-1) {
+					if ($field[$i] eq 'position' or $field[$i] eq 'pos') {
+						$pos_index = $i;
+					} elsif ($field[$i] eq 'modified_call') {
+						$intype = 'snp';
+						print STDERR "NOTICE: Automatically detected input type as $intype\n";
+						$call_index = $i;
+					} elsif ($field[$i] eq 'reference') {
+						$reference_index = $i;
+					} elsif ($field[$i] eq 'type') {
+						$type_index = $i;
+					} elsif ($field[$i] eq 'score') {
+						$score_index = $i;
+					} elsif ($field[$i] eq 'total') {
+						$total_index = $i;
+					} elsif ($field[$i] eq 'used') {
+						$used_index = $i;
+					} elsif ($field[$i] eq 'ref/indel') {
+						$intype = 'indel';
+						print STDERR "NOTICE: Automatically detected input type as $intype\n";
+						$ref_indel_index = $i;
+					} elsif ($field[$i] eq 'Q(indel)') {
+						$quality_index = $i;
+					} elsif ($field[$i] eq 'max_gtype') {
+						$maxgtype_index = $i;
+					} elsif ($field[$i] eq 'bp1_reads') {
+						$bp1_reads_index = $i;
+					} elsif ($field[$i] eq 'ref_reads') {
+						$ref_reads_index = $i;
+					} elsif ($field[$i] eq 'indel_reads') {
+						$indel_reads_index = $i;
+					} elsif ($field[$i] eq 'other_reads') {
+						$other_reads_index = $i;
+					}
+				}
+			}
+			next;
+		}
+		$intype or die "Error: unable to recognize the correct type of the input file (make sure that header line is present in $variantfile)\n";
+		@field = split (/\t/, $_);
+		if ($intype eq 'snp') {					#SNPs
+			defined $pos_index and defined $reference_index and defined $call_index or die "Error: unalbe to find the position, reference and modified_call column header in $variantfile\n";
+			my ($pos, $wt, $obs) = @field[$pos_index, $reference_index, $call_index];
+			my (@other);
+			defined $pos and defined $wt and defined $obs or die;
+			if ($includeinfo) {
+				defined $score_index and push @other, $field[$score_index];
+				defined $total_index and push @other, $field[$total_index];
+				defined $used_index and push @other, $field[$used_index];
+				defined $type_index and push @other, $field[$type_index];
+			}
+			length ($obs) == 1 and $obs .= $obs;
+			my @obs = split (//, $obs);
+			@obs == 2 or die "Error: observed allele $obs should correspond to two nucleotide alleles: <$_>\n";
+			if ($obs[0] eq $wt and $obs[1] eq $wt) {
+				die "Error: reference alleles are identical to observed alleles: <$_>\n";
+			} elsif ($obs[0] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] eq $wt) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] ne $obs[0]) {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "het\t", join ("\t", @other), "\n";
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[1], "\t", "het\t", join ("\t", @other), "\n";
+				$countvar++;
+			} else {
+				print $chr, "\t", $pos, "\t", $pos, "\t", $wt, "\t", $obs[0], "\t", "hom\t", join ("\t", @other), "\n";
+			}
+			$countvar++;
+		} elsif ($intype eq 'indel') {				#indels
+			defined $pos_index and defined $ref_indel_index and defined $maxgtype_index or die "Error: unable to find the pos, ref_indel and max_gtype column header in $variantfile\n";
+			my ($pos, $call, $hom, @other) = @field[$pos_index, $ref_indel_index, $maxgtype_index];
+			if ($includeinfo) {
+				defined $quality_index and push @other, $field[$quality_index];
+				defined $maxgtype_index and push @other, $field[$maxgtype_index];
+				defined $bp1_reads_index and push @other, $field[$bp1_reads_index];
+				defined $ref_reads_index and push @other, $field[$ref_reads_index];
+				defined $indel_reads_index and push @other, $field[$indel_reads_index];
+				defined $other_reads_index and push @other, $field[$other_reads_index];
+			}
+			#hg19 coordinate below; insertion needs position adjustment!!! deletion is fine
+			#948847  1I      CCTCAGGCTT      -/A     ATAATAGGGC      969     hom     47      het     22      0       16      6       A       1       2
+			#978604  2D      CACTGAGCCC      CT/--   GTGTCCTTCC      251     hom     20      het     8       0       4       4       CT      1       0
+			#1276974 4I      CCTCATGCAG      ----/ACAC       ACACATGCAC      838     hom     39      het     18      0       14      4       AC      2       4
+			#1289368 2D      AGCCCGGGAC      TG/--   GGAGCCGCGC      1376    hom     83      het     33      0       25      9       TG      1       0
+			#185137455     11I10M2I        TATGTGTCCT      -----------TTTTTTATTT--/AAATGATAGACTTTTTTTTTTAA ATTTCAGAAA      1126    het     988     hom    45       20      24      7       N/A     0       0
+			#1276931 2D41M4I CACACACATG      CACACACACGCACACACGTGCAATGTGAAAACACCTCATGCAG----/--CACACACGCACACACGTGCAATGTGAAAACACCTCATGCAGACAC ACACATGCAC      548     hom     16      het     8       0       11      11      N/A     0       0
+			my @obs = split (/\//, $call);
+			@obs == 2 or die "Error: observed indel allele $call should correspond to two alleles: <$_>\n";
+			if ($obs[0] =~ m/^\-+$/) {		#insertion
+				my $len = length ($obs[0]);
+				print $chr, "\t", $pos-1, "\t", $pos-1, "\t", '-', "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n";
+			} elsif ($obs[1] =~ m/^\-+$/) {		#deletion
+				my $len = length ($obs[0]);
+				print $chr, "\t", $pos, "\t", $pos+$len-1, "\t", $obs[0], "\t", '-', "\t", $hom, "\t", join ("\t", @other), "\n";
+			} elsif (length ($obs[0]) eq length ($obs[1])) {	#block substitution
+				$obs[0] =~ s/\-//g;
+				$obs[1] =~ s/\-//g;
+				print $chr, "\t", $pos, "\t", $pos+length($obs[0])-1, "\t", $obs[0], "\t", $obs[1], "\t", $hom, "\t", join ("\t", @other), "\n";
+			} else {
+				die "Error: invalid record found in indel line: <$_>\n";
+			}
+			$countvar++;
+		} else {
+			die "Error: invalid record found in $variantfile (11 or 15 fields expected, observed ${\(scalar @field)} fields): <$_>\n";
+		}
+	}
+	print STDERR "NOTICE: Read $countline lines and wrote $countvar variants\n";
+}
+sub convertVCF4 {
+	my ($variantfile) = @_;
+	my ($countline, $countvar, $counthom, $counthet, $countunknown, $countindel, $countsnp, $countti, $counttv) = qw/0 0 0 0 0 0 0 0 0/;
+	my ($source_program);		#the program that generated the VCF4 file
+	open (VAR, $variantfile) or die "Error: cannot read from variant file $variantfile: $!\n";
+	while (<VAR>) {
+		$countline++;
+		if (m/^##fileformat=VCFv(\d+\.)/) {
+			$1<4 and print STDERR "ERROR: Input file is not in VCF version 4 format but is $_" and exit;
+		}
+		if (m/^##UnifiedGenotyper/) {
+			$source_program = 'gatksnp';
+			print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK UnifiedGenotyper\n";
+			print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, RMS mapping quality, quality by depth\n";
+			$fraction and print STDERR "WARNING: the --fraction argument will be ignored for GATK SNP calls!!!\n";
+			$confraction and print STDERR "WARNING: the --confraction argument will be ignored for GATK SNP calls!!!\n";
+		}
+		if (m/^##IndelGenotyper/) {
+			$source_program = 'gatkindel';
+			print STDERR "NOTICE: Detected that the VCF4 file is generated by GATK IndelGenotyper\n";
+			print STDERR "NOTICE: column 6-10 represent heterozygosity status, quality score, read depth, read count supporting indel call, RMS mapping quality\n";
+		}
+		m/^#/ and next;		#skip comment lines
+		s/[\r\n]+$//;		#delete trailing new lines
+		my $otherinfo = $_;	#this is the complete line (when -includeinfo is set, the entire line will be included in output file)
+		#format description: http://www.1000genomes.org/wiki/Analysis/vcf4.0
+		#standard VCF4 should have 8 columns, but some software may produce more columns (for example, for genotype calls). The first 8 columns should follow the specification
+		#example of VCF4 generated by GATK SNP caller
+		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  SAMPLE
+		#1       55      .       T       G       34.82   .       DP=2;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=14.16;MQ0=0;QD=17.41;SB=-10.00     GT:DP:GL:GQ     0/1:1:-6.66,-0.30,-0.00:1.76
+		#1       2646    .       G       A       40.91   .       DP=4;Dels=0.00;HRun=0;HaplotypeScore=0.00;MQ=7.50;MQ0=3;QD=10.23;SB=-10.00      GT:DP:GL:GQ     0/1:1:-7.27,-0.30,-0.00:1.76
+		#example of VCF4 generated by GATK indel caller
+		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  SAMPLE
+		#1       2525324 .       G       GC      .       PASS    AC=5,5;DP=12;MM=4.8,3.7142856;MQ=29.0,42.285713;NQSBQ=33.0,46.463768;NQSMM=0.24,0.20289855;SC=0,5,1,6  GT       0/1
+		#1       3553372 .       GC      G       .       PASS    AC=6,6;DP=6;MM=0.8333333,0.0;MQ=60.0,0.0;NQSBQ=63.533333,0.0;NQSMM=0.0,0.0;SC=0,6,0,0   GT      1/0
+		#1       6093011 .       CG      C       .       PASS    AC=31,31;DP=32;MM=0.7096774,2.0;MQ=59.64516,60.0;NQSBQ=64.192184,39.666668;NQSMM=0.0,0.11111111;SC=23,8,0,1     GT      1/0
+		#example of VCF4 generated by 1000G
+		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+		#1       533     .       G       C       .       PASS    AA=.;AC=6;AN=120;DP=423
+		#1       41342   .       T       A       .       PASS    AA=.;AC=29;AN=120;DP=188
+		#1       41791   .       G       A       .       PASS    AA=.;AC=5;AN=120;DP=192
+		#1       44449   .       T       C       .       PASS    AA=C;AC=2;AN=120;DP=166
+		#1       44539   rs2462492       C       T       .       PASS    AA=T;AC=2;AN=120;DP=131
+		#example of VCF4 generated by 1000G
+		#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+		#1       1000153 .       TCACA   T       100     PASS    AF=0.115095;HP=1;NF=16;NR=13;NS=52;CA=0;DP=615
+		#1       1000906 .       CA      C       48      PASS    AF=0.0772696;HP=1;NF=2;NR=9;NS=51;CA=0;DP=281
+		#1       1000950 rs60561655;-/G  CG      C       100     PASS    AF=0.447771;HP=5;DB;NF=10;NR=20;NS=50;CA=M;DP=291
+		#1       1010786 rs36095298;-/G,mills,venter     A       AG      100     PASS    AF=0.774334;HP=1;DB;NF=21;NR=27;NS=51;CA=0;DP=306
+		#1       1026158 .       T       TGGGGG  100     PASS    AF=0.115637;HP=1;NF=5;NR=2;NS=52;CA=0;DP=591
+		#reserved VCF4 sub-fields in the INFO field
+		#    * AA ancestral allele
+		#    * AC allele count in genotypes, for each ALT allele, in the same order as listed
+		#    * AF allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes
+		#    * AN total number of alleles in called genotypes
+		#    * BQ RMS base quality at this position
+		#    * CIGAR cigar string describing how to align an alternate allele to the reference allele
+		#    * DB dbSNP membership
+		#    * DP combined depth across samples, e.g. DP=154
+		#    * END end position of the variant described in this record (esp. for CNVs)
+		#    * H2 membership in hapmap2
+		#    * MQ RMS mapping quality, e.g. MQ=52
+		#    * MQ0 Number of MAPQ == 0 reads covering this record
+		#    * NS Number of samples with data
+		#    * SB strand bias at this position
+		#    * SOMATIC indicates that the record is a somatic mutation, for cancer genomics
+		#    * VALIDATED validated by follow-up experiment
+		#SAMtools/BCFtools specific information
+		#SAMtools/BCFtools may write the following tags in the INFO field in VCF/BCF.
+		#Tag	Description
+		#I16	16 integers:
+		#1	#reference Q13 bases on the forward strand 	2	#reference Q13 bases on the reverse strand
+		#3	#non-ref Q13 bases on the forward strand 	4	#non-ref Q13 bases on the reverse strand
+		#5	sum of reference base qualities 	6	sum of squares of reference base qualities
+		#7	sum of non-ref base qualities 	8	sum of squares of non-ref base qualities
+		#9	sum of ref mapping qualities 	10	sum of squares of ref mapping qualities
+		#11	sum of non-ref mapping qualities 	12	sum of squares of non-ref mapping qualities
+		#13	sum of tail distance for ref bases 	14	sum of squares of tail distance for ref bases
+		#15	sum of tail distance for non-ref bases 	16	sum of squares of tail distance for non-ref
+		#INDEL	Indicating the variant is an INDEL.
+		#DP	The number of reads covering or bridging POS.
+		#DP4	Number of 1) forward ref alleles; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref alleles, used in variant calling. Sum can be smaller than DP because low-quality bases are not counted.
+		#PV4	P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t)
+		#FQ	Consensus quality. If positive, FQ equals the phred-scaled probability of there being two or more different alleles. If negative, FQ equals the minus phred-scaled probability of all chromosomes being identical. Notably, given one sample, FQ is positive at hets and negative at homs.
+		#AF1	EM estimate of the site allele frequency of the strongest non-reference allele.
+		#CI95	Equal-tail (Bayesian) credible interval of the site allele frequency at the 95% level.
+		#PC2	Phred-scaled probability of the alternate allele frequency of group1 samples being larger (,smaller) than of group2 samples.
+		#PCHI2	Posterior weighted chi^2 P-value between group1 and group2 samples. This P-value is conservative.
+		#QCHI2	Phred-scaled PCHI2
+		#RP	Number of permutations yeilding a smaller PCHI2
+		#example of triallelic variants generated by mpileup/bcftools
+		#1       156706559       .       A       C,G     114     .	DP=20;AF1=1;CI95=1,1;DP4=0,0,1,19;MQ=60;FQ=-63  GT:PL:GQ	1/2:237,126,90,162,0,138:99
+		#6       31129642        .       A       G,C     76      .	DP=31;AF1=1;CI95=1,1;DP4=0,0,28,3;MQ=60;FQ=-75  GT:PL:GQ	1/2:255,194,146,164,0,119:99
+		#1       11297762        .       T       C,A     98      .	DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78  GT:PL:GQ	1/1:131,51,0,120,28,117:99
+		my @field=split(/\t/,$_);
+		@field >=8 or die "Error: invalid record found in VCF4 file (at least 8 tab-delimited fields expected): <$_>\n";
+		my ($chr, $start, $ID, $ref_allele, $mut_allele, $quality_score, $filter, $info, $format, $sample) = @field;
+		my ($end);
+		my ($mut_allele2, $zygosity);
+		if ($filterword) {		#ensure that the filter field contains the filterword
+			$filter =~ m/\b$filterword\b/i or next;
+		}
+		#sometimes the alleles are not in the same case
+		#chr1    1869771 1869774 actc    aCTctc          43.5    13      INDEL;DP=13;AF1=0.5;CI95=0.5,0.5;DP4=0,4,4,0;MQ=37;PV4=0.029,0.45,1,0.46
+		$ref_allele = uc $ref_allele;
+		$mut_allele = uc $mut_allele;
+		#if ($ID eq '.' || $ID =~ /^rs/) {		#per MISHIMA, Hiroyuki suggestion (vcf4's third column (ID column) are not always ".")
+		#	$end = $start;				#this block is commented out on 2011feb19
+		#}
+		if ($mut_allele eq '.') {			#no variant call was made at this position
+			next;
+		}
+		if ($mut_allele =~ m/([^,]+),([^,]+)/) {
+			$mut_allele = $1;
+			$mut_allele2 = $2;
+		}
+		if(length($ref_allele)==1 && length($mut_allele)==1) {  	### output snv
+			my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
+			my ($MappingQuality) = $info =~ /MQ=([^;]+)/;
+			my ($QualityByDepth) = $info =~ /QD=([^;]+)/;
+			if ($coverage) {
+				defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
+				if ($maxcoverage) {
+					defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
+				}
+			}
+			if ($snpqual) {
+				defined $QualityByDepth and $QualityByDepth >= $snpqual || next;		#the QD was used here as quality score
+			}
+			if (defined $sample) {
+				if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
+					$zygosity = 'het';
+					$counthet++;
+				} elsif ($sample =~ m#^1/1#) {
+					$zygosity = 'hom';
+					$counthom++;
+				} else {
+					$zygosity = 'unknown';
+					$countunknown++;
+				}
+			} else {
+				$zygosity = 'unknown';
+				$countunknown++;
+			}
+			#the subject is called as homozygous for the first alternative allele (genotype 1/1. i.e. C/C), but since there was one read containing A, samtools still keep both alleles in the VCF file (but gives a very low probabilities for it).
+			#1       11297762        .       T       C,A     98      . DP=19;AF1=1;CI95=1,1;DP4=0,0,17,1;MQ=60;FQ=-78  GT:PL:GQ 1/1:131,51,0,120,28,117:99
+			if ($mut_allele2 and $zygosity eq 'hom') {
+				$mut_allele2 = '';
+			}
+			if (not $mut_allele2) {
+				if ($ref_allele eq 'A' and $mut_allele eq 'G' or $ref_allele eq 'G' and $mut_allele eq 'A' or $ref_allele eq 'C' and $mut_allele eq 'T' or $ref_allele eq 'T' and $mut_allele eq 'C') {
+					$countti++;
+				} else {
+					$counttv++;
+				}
+			}
+			print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele, "\t$zygosity",  "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";
+			if ($allallele) {
+				if ($mut_allele2) {
+					print $chr, "\t", $start, "\t", $start, "\t", $ref_allele, "\t", $mut_allele2, "\t$zygosity",  "\t", $quality_score, (defined $unfiltered_read_depth)? "\t$unfiltered_read_depth" : '', (defined $MappingQuality) ? "\t$MappingQuality" : '', (defined $QualityByDepth) ? "\t$QualityByDepth" : '', $includeinfo ? "\t$otherinfo" : '', "\n";
+				}
+			}
+			$countsnp++;
+		} elsif (length($ref_allele) > 1 || length($mut_allele) > 1) {  ### output indel
+			my ($indel_read_depth1, $indel_read_depth2) = $info =~ /AC=([^,;]+),([^,;]+)/;		#number of reads supporting consensus indel, any indel
+			my ($unfiltered_read_depth) = $info =~ /DP=(\d+)/;
+			if ($coverage) {
+				defined $unfiltered_read_depth and $unfiltered_read_depth >= $coverage || next;
+				if ($maxcoverage) {
+					defined $unfiltered_read_depth and $unfiltered_read_depth <= $maxcoverage || next;
+				}
+			}
+			if (defined $indel_read_depth1 and defined $unfiltered_read_depth) {
+				$indel_read_depth1/$unfiltered_read_depth >= $fraction or next;		#do not meet minimum alternative allele fraction threshold
+				$indel_read_depth1/$indel_read_depth2 >= $confraction or next;
+			}
+			my ($MappingQuality) = $info =~ /MQ=([^;]+),/;
+			#example VCF4 records below:
+			#20      2       .       TCG     T       .       PASS    DP=100
+			#Chr1    5473    .       AT      ATT     23.5    .       INDEL;DP=16;AF1=0.5;CI95=0.5,0.5;DP4=4,2,3,1;MQ=42;PV4=1,0.41,0.042,0.24
+			#Chr1    6498    .       ATTTT   ATTTTT  53.5    .       INDEL;DP=9;AF1=1;CI95=1,1;DP4=0,0,5,3;MQ=28
+			if(length($ref_allele) > length ($mut_allele)) { 		# deletion or block substitution
+				my $head = substr($ref_allele, 0, length ($mut_allele));
+				if ($head eq $mut_allele) {
+					print $chr,"\t";
+					print $start+length($head),"\t";
+					print $start+length($ref_allele)-1,"\t";
+					$ref_allele = substr ($ref_allele, length ($mut_allele));
+					print $ref_allele,"\t";
+					print "-";
+				} else {
+					print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t";
+				}
+			} elsif(length($mut_allele) >= length ($ref_allele)) { 		# insertion or block substitution
+				my $head = substr ($mut_allele, 0, length ($ref_allele));
+				if ($head eq $ref_allele) {
+					print $chr,"\t";
+					print $start+length($ref_allele)-1,"\t";
+					print $start+length($ref_allele)-1,"\t";
+					$mut_allele = substr ($mut_allele, length ($ref_allele));
+					print "-\t";
+					print $mut_allele;
+				} else {
+					print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele, "\t";
+				}
+			}
+			if (defined $sample) {
+				if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
+					print "\thet";
+					$counthet++;
+				} elsif ($sample =~ m#^1/1#) {
+					print "\thom";
+					$counthom++;
+				} # BEGIN ARQ
+				elsif ($sample =~ m#^./.#) {
+					print "\tunknown";
+					$countunknown++;
+				} # END ARQ
+			}
+			print "\t", $quality_score;
+			defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
+			defined $indel_read_depth1 and print "\t", $indel_read_depth1;
+			defined $MappingQuality and print "\t", $MappingQuality;
+			$includeinfo and print "\t", $otherinfo;
+			print "\n";
+			$countindel++;
+			#do the same thing again, exactly like above, except that we work on second mutation;
+			#in the future, consider rewrite this paragraph to make the code more elegant
+			if ($allallele) {
+				if ($mut_allele2) {
+					if(length($ref_allele) > length ($mut_allele2)) { 		# deletion or block substitution
+						my $head = substr($ref_allele, 0, length ($mut_allele2));
+						if ($head eq $mut_allele2) {
+							print $chr,"\t";
+							print $start+length($head),"\t";
+							print $start+length($ref_allele)-1,"\t";
+							$ref_allele = substr ($ref_allele, length ($mut_allele2));
+							print $ref_allele,"\t";
+							print "-";
+						} else {
+							print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
+						}
+					} elsif(length($mut_allele2) > length ($ref_allele)) { 		# insertion or block substitution
+						my $head = substr ($mut_allele2, 0, length ($ref_allele));
+						if ($head eq $ref_allele) {
+							print $chr,"\t";
+							print $start+length($ref_allele)-1,"\t";
+							print $start+length($ref_allele)-1,"\t";
+							$mut_allele = substr ($mut_allele2, length ($ref_allele));
+							print "-\t";
+							print $mut_allele2;
+						} else {
+							print $chr, "\t", $start, "\t", $start+length($ref_allele)-1, "\t", $ref_allele, "\t", $mut_allele2;
+						}
+					}
+					if (defined $sample) {
+						if ($sample =~ m#^0/1# or $sample =~ m#^1/0#) {
+							print "\thet";
+							$counthet++;
+						} elsif ($sample =~ m#^1/1#) {
+							print "\thom";
+							$counthom++;
+						} # BEGIN ARQ
+						elsif ($sample =~ m#^./.#) {
+							print "\tunknown";
+							$countunknown++;
+						} # END ARQ
+					}
+					print "\t", $quality_score;
+					defined $unfiltered_read_depth and print "\t", $unfiltered_read_depth;
+					defined $indel_read_depth1 and print "\t", $indel_read_depth1;
+					defined $MappingQuality and print "\t", $MappingQuality;
+					$includeinfo and print "\t", $otherinfo;
+					print "\n";
+				}
+			}
+		}
+		$countvar++;
+	}
+	my $triallelic = $countsnp-$countti-$counttv;
+	print STDERR "NOTICE: Read $countline lines and wrote ${\($counthet+$counthom)} different variants at $countvar genomic positions ($countsnp SNPs and $countindel indels)\n";
+	print STDERR "NOTICE: Among ${\($counthet+$counthom+$countunknown)} different variants at $countvar positions, $counthet are heterozygotes, $counthom are homozygotes\n";
+	print STDERR "NOTICE: Among $countsnp SNPs, $countti are transitions, $counttv are transversions", $triallelic?", $triallelic have more than 2 alleles\n":"\n";
+}
+=head1 SYNOPSIS
+convert2annovar.pl [arguments] <variantfile>
+Optional arguments:
+-h, --help                      print help message
+-m, --man                       print complete documentation
+-v, --verbose                   use verbose output
+--format <string>		input format (default: pileup)
+--outfile <file>		output file name (default: STDOUT)
+--snpqual <float>		quality score threshold in pileup file (default: 20)
+--snppvalue <float>		SNP P-value threshold in GFF3-SOLiD file (default: 1)
+--coverage <int>		read coverage threshold in pileup file (default: 0)
+--maxcoverage <int>		maximum coverage threshold (default: none)
+--includeinfo		include supporting information in output
+--chr <string>		specify the chromosome (for CASAVA format)
+--chrmt <string>		chr identifier for mitochondria (default: M)
+--altcov <int>		alternative allele coverage threshold (for pileup format)
+--fraction <float>		minimum allelic fraction to claim a mutation (for pileup/vcf4_indel format)
+--species <string>		if human, convert chr23/24/25 to X/Y/M (for gff3-solid format)
+--filter <string>		output variants with this filter (case insensitive, for vcf4 format)
+--confraction <float>	minimum consensus indel / all indel fraction (for vcf4_indel format)
+--allallele			print all alleles when multiple calls are present (for vcf4 format)
+Function: convert variant call file generated from various software programs
+into ANNOVAR input format
+Example: convert2annovar.pl -format pileup -outfile variant.query variant.pileup
+convert2annovar.pl -format cg -outfile variant.query variant.cg
+convert2annovar.pl -format gff3-solid -outfile variant.query variant.snp.gff
+convert2annovar.pl -format soap variant.snp > variant.avinput
+convert2annovar.pl -format maq variant.snp > variant.avinput
+convert2annovar.pl -format casava -chr 1 variant.snp > variant.avinput
+convert2annovar.pl -format vcf4 variantfile > variant.avinput
+convert2annovar.pl -format vcf4 -filter pass variantfile > variant.avinput
+Version: $LastChangedDate: 2011-05-06 05:16:44 -0700 (Fri, 06 May 2011) $
+=head1 OPTIONS
+=over 8
+=item B<--help>
+print a brief usage message and detailed explanation of options.
+=item B<--man>
+print the complete manual of the program.
+=item B<--verbose>
+use verbose output.
+=item B<--format>
+the format of the input files.
+=item B<--outfile>
+specify the output file name. By default, output is written to STDOUT.
+=item B<--snpqual>
+quality score threshold in the pileup file, such that variant calls with lower
+quality scores will not be printed out in the output file. When VCF4 file is
+used, this argument works on the Quality-by-Depth measure, rather than the raw
+quality measure.
+=item B<--coverage>
+read coverage threshold in the pileup file, such that variants calls generated
+with lower coverage will not be printed in the output file.
+=item B<--includeinfo>
+specify that the output should contain additional information in the input line.
+By default, only the chr, start, end, reference allele, observed allele and
+homozygosity status are included in output files.
+=item B<--chr>
+specify the chromosome for CASAVA format
+=item B<--chrmt>
+specify the name of mitochondria chromosome (default is MT)
+=item B<--altcov>
+the minimum coverage of the alternative (mutated) allele to be printed out in
+output
+=item B<--fraction>
+specify the minimum fraction of alternative allele, to print out the mutation.
+For example, a site has 10 reads, 3 supports alternative allele. A -fraction of
+0.4 will not allow the mutation to be printed out.
+=item B<--species>
+specify the species from which the sequencing data is obtained. For the GFF3-
+SOLiD format, when species is human, the chromosome 23, 24 and 25 will be
+converted to X, Y and M, respectively.
+=item B<--filter>
+for VCF4 file, only print out variant calls with this filter annotated. For
+example, if using GATK VariantFiltration walker, you will see PASS,
+GATKStandard, HARD_TO_VALIDATE, etc in the filter field. Using 'pass' as a
+filter is recommended in this case.
+=item B<--confraction>
+consesus indel fraction, calculated as reads supporting consensus indels divided
+by reads supporting any indels
+=item B<--allallele>
+print all alleles for mutations at a locus, rather than the first allele, if the
+input VCF4 file contains multiple alternative alleles for a mutation. By
+default, this option is off. When it is on, two lines will be printed out in the
+output, and both will have the same quality scores as VCF4 does not provide
+separate quality scores for individual alleles.
+=back
+=head1 DESCRIPTION
+This program is used to convert variant call file generated from various
+software programs into ANNOVAR input format. Currently, the program can handle
+Samtools genotype-calling pileup format, Solid GFF format, Complete Genomics
+variant format, SOAP format. These formats are described below.
+=over 8
+=item * B<pileup format>
+The pileup format can be produced by the Samtools genotyping calling subroutine.
+Note that the phrase pileup format can be used in several instances, and here I
+am only referring to the pileup files that contains the actual genotype calls.
+Using SamTools, given an alignment file in BAM format, a pileup file with
+genotype calls can be produced by the command below:
+	samtools pileup -vcf ref.fa aln.bam> raw.pileup
+	samtools.pl varFilter raw.pileup > final.pileup
+ANNOVAR will automatically filter the pileup file so that only SNPs reaching a
+quality threshold are printed out (default is 20, use --snpqual argument to
+change this). Most likely, users may want to also apply a coverage threshold,
+such that SNPs calls from only a few reads are not considered. This can be
+achieved using the -coverage argument (default value is 0).
+An example of pileup files for SNPs is shown below:
+	chr1 556674 G G 54 0 60 16 a,.....,...,.... (B%A+%7B;0;%=B<:
+	chr1 556675 C C 55 0 60 16 ,,..A..,...,.... CB%%5%,A/+,%....
+	chr1 556676 C C 59 0 60 16 g,.....,...,.... .B%%.%.?.=/%...1
+	chr1 556677 G G 75 0 60 16 ,$,.....,...,.... .B%%9%5A6?)%;?:<
+	chr1 556678 G K 60 60 60 24 ,$.....,...,....^~t^~t^~t^~t^~t^~t^~t^~t^~t B%%B%<A;AA%??<=??;BA%B89
+	chr1 556679 C C 61 0 60 23 .....a...a....,,,,,,,,, %%1%&?*:2%*&)(89/1A@B@@
+	chr1 556680 G K 88 93 60 23 ..A..,..A,....ttttttttt %%)%7B:B0%55:7=>>A@B?B;
+	chr1 556681 C C 102 0 60 25 .$....,...,....,,,,,,,,,^~,^~. %%3%.B*4.%.34.6./B=?@@>5.
+	chr1 556682 A A 70 0 60 24 ...C,...,....,,,,,,,,,,. %:%(B:A4%7A?;A><<999=<<
+	chr1 556683 G G 99 0 60 24 ....,...,....,,,,,,,,,,. %A%3B@%?%C?AB@BB/./-1A7?
+The columns are chromosome, 1-based coordinate, reference base, consensus base,
+consensus quality, SNP quality, maximum mapping quality of the reads covering
+the sites, the number of reads covering the site, read bases and base qualities.
+An example of pileup files for indels is shown below:
+	seq2  156 *  +AG/+AG  71  252  99  11  +AG  *  3  8  0
+ANNOVAR automatically recognizes both SNPs and indels in pileup file, and process them correctly.
+=item * B<GFF3-SOLiD format>
+The SOLiD provides a GFF3-compatible format for SNPs, indels and structural
+variants. A typical example file is given below:
+	##gff-version 3
+	##solid-gff-version 0.3
+	##source-version 2
+	##type DNA
+	##date 2009-03-13
+	##time 0:0:0
+	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
+	##reference-file
+	##input-files Yoruban_snp_10x.txt
+	##run-path
+	chr_name        AB_SOLiD SNP caller     SNP     coord   coord   1       .       .       coverage=# cov;ref_base=ref;ref_score=score;ref_confi=confi;ref_single=Single;ref_paired=Paired;consen_base=consen;consen_score=score;consen_confi=conf;consen_single=Single;consen_paired=Paired;rs_id=rs_id,dbSNP129
+	1       AB_SOLiD SNP caller     SNP     997     997     1       .       .       coverage=3;ref_base=A;ref_score=0.3284;ref_confi=0.9142;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6716;consen_confi=0.9349;consen_single=0/0;consen_paired=2/2
+	1       AB_SOLiD SNP caller     SNP     2061    2061    1       .       .       coverage=2;ref_base=G;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.8985;consen_single=0/0;consen_paired=2/2
+	1       AB_SOLiD SNP caller     SNP     4770    4770    1       .       .       coverage=2;ref_base=A;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=G;consen_score=1.0000;consen_confi=0.8854;consen_single=0/0;consen_paired=2/2
+	1       AB_SOLiD SNP caller     SNP     4793    4793    1       .       .       coverage=14;ref_base=A;ref_score=0.0723;ref_confi=0.8746;ref_single=0/0;ref_paired=1/1;consen_base=G;consen_score=0.6549;consen_confi=0.8798;consen_single=0/0;consen_paired=9/9
+	1       AB_SOLiD SNP caller     SNP     6241    6241    1       .       .       coverage=2;ref_base=T;ref_score=0.0000;ref_confi=0.0000;ref_single=0/0;ref_paired=0/0;consen_base=C;consen_score=1.0000;consen_confi=0.7839;consen_single=0/0;consen_paired=2/2
+Newer version of ABI BioScope now use diBayes caller, and the output file is given below:
+	##gff-version 3
+	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
+	##List of SNPs. Date Sat Dec 18 10:30:45 2010    Stringency: medium Mate Pair: 1 Read Length: 50 Polymorphism Rate: 0.003000 Bayes Coverage: 60 Bayes_Single_SNP: 1 Filter_Single_SNP: 1 Quick_P_Threshold: 0.997000 Bayes_P_Threshold: 0.040000 Minimum_Allele_Ratio: 0.150000 Minimum_Allele_Ratio_Multiple_of_Dicolor_Error: 100
+	##1     chr1
+	##2     chr2
+	##3     chr3
+	##4     chr4
+	##5     chr5
+	##6     chr6
+	##7     chr7
+	##8     chr8
+	##9     chr9
+	##10    chr10
+	##11    chr11
+	##12    chr12
+	##13    chr13
+	##14    chr14
+	##15    chr15
+	##16    chr16
+	##17    chr17
+	##18    chr18
+	##19    chr19
+	##20    chr20
+	##21    chr21
+	##22    chr22
+	##23    chrX
+	##24    chrY
+	##25    chrM
+	# source-version SOLiD BioScope diBayes(SNP caller)
+	#Chr    Source  Type    Pos_Start       Pos_End Score   Strand  Phase   Attributes
+	chr1    SOLiD_diBayes   SNP     221367  221367  0.091151        .       .       genotype=R;reference=G;coverage=3;refAlleleCounts=1;refAlleleStarts=1;refAlleleMeanQV=29;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=27;diColor1=11;diColor2=33;het=1;flag=
+	chr1    SOLiD_diBayes   SNP     555317  555317  0.095188        .       .       genotype=Y;reference=T;coverage=13;refAlleleCounts=11;refAlleleStarts=10;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=00;diColor2=22;het=1;flag=
+	chr1    SOLiD_diBayes   SNP     555327  555327  0.037582        .       .       genotype=Y;reference=T;coverage=12;refAlleleCounts=6;refAlleleStarts=6;refAlleleMeanQV=19;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=29;diColor1=12;diColor2=30;het=1;flag=
+	chr1    SOLiD_diBayes   SNP     559817  559817  0.094413        .       .       genotype=Y;reference=T;coverage=9;refAlleleCounts=5;refAlleleStarts=4;refAlleleMeanQV=23;novelAlleleCounts=2;novelAlleleStarts=2;novelAlleleMeanQV=14;diColor1=11;diColor2=33;het=1;flag=
+	chr1    SOLiD_diBayes   SNP     714068  714068  0.000000        .       .       genotype=M;reference=C;coverage=13;refAlleleCounts=7;refAlleleStarts=6;refAlleleMeanQV=25;novelAlleleCounts=6;novelAlleleStarts=4;novelAlleleMeanQV=22;diColor1=00;diColor2=11;het=1;flag=
+	The file conforms to standard GFF3 specifications, but the last column is solid-
+	specific and it gives certain parameters for the SNP calls.
+An example of the short indel format by GFF3-SOLiD is given below:
+	##gff-version 3
+	##solid-gff-version 0.3
+	##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
+	##type DNA
+	##date 2009-01-26
+	##time 18:33:20
+	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
+	##reference-file
+	##input-files ../../mp-results/JOAN_20080104_1.pas,../../mp-results/BARB_20071114_1.pas,../../mp-results/BARB_20080227_2.pas
+	##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-w2x25-2x-4x-8x-10x/2x
+	##Filter-settings: max-ave-read-pos=none,min-ave-from-end-pos=9.1,max-nonreds-4filt=2,min-insertion-size=none,min-deletion-size=none,max-insertion-size=none,max-deletion-size=none,require-called-indel-size?=T
+	chr1    AB_SOLiD Small Indel Tool       deletion        824501  824501  1       .       .       del_len=1;tight_chrom_pos=824501-824502;loose_chrom_pos=824501-824502;no_nonred_reads=2;no_mismatches=1,0;read_pos=4,6;from_end_pos=21,19;strands=+,-;tags=R3,F3;indel_sizes=-1,-1;read_seqs=G3021212231123203300032223,T3321132212120222323222101;dbSNP=rs34941678,chr1:824502-824502(-),EXACT,1,/GG
+	chr1    AB_SOLiD Small Indel Tool       insertion_site  1118641 1118641 1       .       .       ins_len=3;tight_chrom_pos=1118641-1118642;loose_chrom_pos=1118641-1118642;no_nonred_reads=2;no_mismatches=0,1;read_pos=17,6;from_end_pos=8,19;strands=+,+;tags=F3,R3;indel_sizes=3,3;read_seqs=T0033001100022331122033112,G3233112203311220000001002
+The keyword deletion or insertion_site is used in the fourth column to indicate
+that file format.
+An example of the medium CNV format by GFF3-SOLiD is given below:
+	##gff-version 3
+	##solid-gff-version 0.3
+	##source-version SOLiD Corona Lite v.4.0r2.0, find-small-indels.pl v 1.0.1, process-small-indels v 0.2.2, 2009-01-12 12:28:49
+	##type DNA
+	##date 2009-01-27
+	##time 15:54:36
+	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
+	##reference-file
+	##input-files big_d20e5-del12n_up-ConsGrp-2nonred.pas.sum
+	##run-path /data/results2/Yoruban-frag-indel/try.01.06/mp-results-lmp-e5/big_d20e5-indel_950_2050
+	chr1    AB_SOLiD Small Indel Tool       deletion        3087770 3087831 1       .       .       del_len=62;tight_chrom_pos=none;loose_chrom_pos=3087768-3087773;no_nonred_reads=2;no_mismatches=2,2;read_pos=27,24;from_end_pos=23,26;strands=-,+;tags=F3,F3;indel_sizes=-62,-62;read_seqs=T11113022103331111130221213201111302212132011113022,T02203111102312122031111023121220311111333012203111
+	chr1    AB_SOLiD Small Indel Tool       deletion        4104535 4104584 1       .       .       del_len=50;tight_chrom_pos=4104534-4104537;loose_chrom_pos=4104528-4104545;no_nonred_reads=3;no_mismatches=0,4,4;read_pos=19,19,27;from_end_pos=31,31,23;strands=+,+,-;tags=F3,R3,R3;indel_sizes=-50,-50,-50;read_seqs=T31011011013211110130332130332132110110132020312332,G21031011013211112130332130332132110132132020312332,G20321302023001101123123303103303101113231011011011
+	chr1    AB_SOLiD Small Indel Tool       insertion_site  2044888 2044888 1       .       .       ins_len=18;tight_chrom_pos=2044887-2044888;loose_chrom_pos=2044887-2044889;no_nonred_reads=2;bead_ids=1217_1811_209,1316_908_1346;no_mismatches=0,2;read_pos=13,15;from_end_pos=37,35;strands=-,-;tags=F3,F3;indel_sizes=18,18;read_seqs=T31002301231011013121000101233323031121002301231011,T11121002301231011013121000101233323031121000101231;non_indel_no_mismatches=3,1;non_indel_seqs=NIL,NIL
+	chr1    AB_SOLiD Small Indel Tool       insertion_site  74832565        74832565        1       .       .       ins_len=16;tight_chrom_pos=74832545-74832565;loose_chrom_pos=74832545-74832565;no_nonred_reads=2;bead_ids=1795_181_514,1651_740_519;no_mismatches=0,2;read_pos=13,13;from_end_pos=37,37;strands=-,-;tags=F3,R3;indel_sizes=16,16;read_seqs=T33311111111111111111111111111111111111111111111111,G23311111111111111111111111111111111111111311011111;non_indel_no_mismatches=1,0;non_indel_seqs=NIL,NIL
+An example of the large indel format by GFF3-SOLiD is given below:
+	##gff-version 3
+	##solid-gff-version 0.3
+	##source-version ???
+	##type DNA
+	##date 2009-03-13
+	##time 0:0:0
+	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
+	##reference-file
+	##input-files /data/results5/yoruban_strikes_back_large_indels/LMP/five_mm_unique_hits_no_rescue/5_point_6x_del_lib_1/results/NA18507_inter_read_indels_5_point_6x.dat
+	##run-path
+	chr1    AB_SOLiD Large Indel Tool       insertion_site  1307279 1307791 1       .       .       deviation=-742;stddev=7.18;ref_clones=-;dev_clones=4
+	chr1    AB_SOLiD Large Indel Tool       insertion_site  2042742 2042861 1       .       .       deviation=-933;stddev=8.14;ref_clones=-;dev_clones=3
+	chr1    AB_SOLiD Large Indel Tool       insertion_site  2443482 2444342 1       .       .       deviation=-547;stddev=11.36;ref_clones=-;dev_clones=17
+	chr1    AB_SOLiD Large Indel Tool       insertion_site  2932046 2932984 1       .       .       deviation=-329;stddev=6.07;ref_clones=-;dev_clones=14
+	chr1    AB_SOLiD Large Indel Tool       insertion_site  3166925 3167584 1       .       .       deviation=-752;stddev=13.81;ref_clones=-;dev_clones=14
+An example of the CNV format by GFF3-SOLiD if given below:
+	##gff-version 3
+	##solid-gff-version 0.3
+	##source-version ???
+	##type DNA
+	##date 2009-03-13
+	##time 0:0:0
+	##feature-ontology http://song.cvs.sourceforge.net/*checkout*/song/ontology/sofa.obo?revision=1.141
+	##reference-file
+	##input-files Yoruban_cnv.coords
+	##run-path
+	chr1    AB_CNV_PIPELINE repeat_region   1062939 1066829 .       .       .       fraction_mappable=51.400002;logratio=-1.039300;copynum=1;numwindows=1
+	chr1    AB_CNV_PIPELINE repeat_region   1073630 1078667 .       .       .       fraction_mappable=81.000000;logratio=-1.409500;copynum=1;numwindows=2
+	chr1    AB_CNV_PIPELINE repeat_region   2148325 2150352 .       .       .       fraction_mappable=98.699997;logratio=-1.055000;copynum=1;numwindows=1
+	chr1    AB_CNV_PIPELINE repeat_region   2245558 2248109 .       .       .       fraction_mappable=78.400002;logratio=-1.042900;copynum=1;numwindows=1
+	chr1    AB_CNV_PIPELINE repeat_region   3489252 3492632 .       .       .       fraction_mappable=59.200001;logratio=-1.119900;copynum=1;numwindows=1
+	chr1    AB_CNV_PIPELINE repeat_region   5654415 5657276 .       .       .       fraction_mappable=69.900002;logratio=1.114500;copynum=4;numwindows=1
+	chr1    AB_CNV_PIPELINE repeat_region   9516165 9522726 .       .       .       fraction_mappable=65.850006;logratio=-1.316700;numwindows=2
+	chr1    AB_CNV_PIPELINE repeat_region   16795117        16841025        .       .       .       fraction_mappable=44.600002;logratio=1.880778;copynum=7;numwindows=9
+The keyword repeat_region is used here, although it actually refers to CNVs.
+An example of the inversion format by GFF3-SOLiD is given below:
+	##gff-version 3
+	##solid-gff-version 0.2
+	##generated by SOLiD inversion tool
+	chr10   AB_SOLiD        inversion       46443107        46479585        268.9   .       .       left=chr10:46443107-46443146;right=chr10:46479583-46479585;leftscore=295.0;rightscore=247.0;count_AAA_further_left=117;count_AAA_left=3;count_AAA_right=3;count_AAA_further_right=97;left_min_count_AAA=chr10:46443107-46443112;count_AAA_min_left=0;count_AAA_max_left=3;right_min_count_AAA=chr10:46479585-46479585;count_AAA_min_right=1;count_AAA_max_right=3;homozygous=UNKNOWN
+	chr4    AB_SOLiD        inversion       190822813       190850112       214.7   .       .       left=chr4:190822813-190822922;right=chr4:190850110-190850112;leftscore=140.0;rightscore=460.0;count_AAA_further_left=110;count_AAA_left=78;count_AAA_right=74;count_AAA_further_right=77;left_min_count_AAA=chr4:190822813-190822814;count_AAA_min_left=69;count_AAA_max_left=77;right_min_count_AAA=chr4:190850110-190850112;count_AAA_min_right=74;count_AAA_max_right=74;homozygous=NO
+	chr6    AB_SOLiD        inversion       168834969       168837154       175.3   .       .       left=chr6:168834969-168835496;right=chr6:168836643-168837154;leftscore=185.4;rightscore=166.2;count_AAA_further_left=67;count_AAA_left=43;count_AAA_right=40;count_AAA_further_right=59;left_min_count_AAA=chr6:168835058-168835124,chr6:168835143-168835161,chr6:168835176-168835181,chr6:168835231-168835262;count_AAA_min_left=23;count_AAA_max_left=29;right_min_count_AAA=chr6:168836643-168836652;count_AAA_min_right=23;count_AAA_max_right=31;homozygous=NO
+The program should be able to recognize all the above GFF3-SOLiD format
+automatically, and handle them accordingly.
+=item * B<Complete Genomics format>
+This format is provided by the Complete Genomics company to their customers. The
+file var-[ASM-ID].tsv.bz2 includes a description of all loci where the assembled
+genome differs from the reference genome.
+An example of the Complete Genomics format is shown below:
+	#BUILD  1.5.0.5
+	#GENERATED_AT   2009-Nov-03 19:52:21.722927
+	#GENERATED_BY   dbsnptool
+	#TYPE   VAR-ANNOTATION
+	#VAR_ANN_SET    /Proj/Pipeline/Production_Data/REF/HUMAN-F_06-REF/dbSNP.csv
+	#VAR_ANN_TYPE   dbSNP
+	#VERSION        0.3
+	>locus  ploidy  haplotype       chromosome      begin   end     varType reference       alleleSeq       totalScore      hapLink xRef
+	1       2       all     chr1    0       959     no-call =       ?
+	2       2       all     chr1    959     972     =       =       =
+	3       2       all     chr1    972     1001    no-call =       ?
+	4       2       all     chr1    1001    1008    =       =       =
+	5       2       all     chr1    1008    1114    no-call =       ?
+	6       2       all     chr1    1114    1125    =       =       =
+	7       2       all     chr1    1125    1191    no-call =       ?
+	8       2       all     chr1    1191    1225    =       =       =
+	9       2       all     chr1    1225    1258    no-call =       ?
+	10      2       all     chr1    1258    1267    =       =       =
+	12      2       all     chr1    1267    1275    no-call =       ?
+	13      2       all     chr1    1275    1316    =       =       =
+	14      2       all     chr1    1316    1346    no-call =       ?
+	15      2       all     chr1    1346    1367    =       =       =
+	16      2       all     chr1    1367    1374    no-call =       ?
+	17      2       all     chr1    1374    1388    =       =       =
+	18      2       all     chr1    1388    1431    no-call =       ?
+	19      2       all     chr1    1431    1447    =       =       =
+	20      2       all     chr1    1447    1454    no-call =       ?
+The following information is provided in documentation from Complete Genomics, that describes the var-ASM format.
+	1. locus. Identifier of a particular genomic locus
+	2. ploidy. The ploidy of the reference genome at the locus (= 2 for autosomes, 2 for pseudoautosomal regions on the sex chromosomes, 1 for males on the non-pseudoautosomal parts of the sex chromosomes, 1 for mitochondrion, '?' if varType is 'no-ref' or 'PAR-called-in-X'). The reported ploidy is fully determined by gender, chromosome and location, and is not inferred from the sequence data.
+	3. haplotype. Identifier for each haplotype at the variation locus. For diploid genomes, 1 or 2. Shorthand of 'all' is allowed where the varType field is one of 'ref', 'no-call', 'no-ref', or 'PAR-called-in-X'. Haplotype numbering does not imply phasing; haplotype 1 in locus 1 is not necessarily in phase with haplotype 1 in locus 2. See hapLink, below, for phasing information.
+	4. chromosome. Chromosome name in text: 'chr1','chr2', ... ,'chr22','chrX','chrY'. The mitochondrion is represented as 'chrM'. The pseudoautosomal regions within the sex chromosomes X and Y are reported at their coordinates on chromosome X.
+	5. begin. Reference coordinate specifying the start of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
+	6. end. Reference coordinate specifying the end of the variation (not the locus) using the half-open zero-based coordinate system. See section 'Sequence Coordinate System' for more information.
+	7. varType. Type of variation, currently one of:
+		snp: single-nucleotide polymorphism
+		ins: insertion
+		del: deletion
+		sub: Substitution of one or more reference bases with the bases in the allele column
+		'ref' : no variation; the sequence is identical to the reference sequence on the indicated haplotype
+		no-call-rc: 'no-call reference consistent 'one or more bases are ambiguous, but the allele is potentially consistent with the reference
+		no-call-ri: 'no-call reference inconsistent' one or more bases are ambiguous, but the allele is definitely inconsistent with the reference
+		no-call: an allele is completely indeterminate in length and composition, i.e. alleleSeq = '?'
+		no-ref: the reference sequence is unspecified at this locus.
+		PAR-called-in-X: this locus overlaps one of the pseudoautosomal regions on the sex chromosomes. The called sequence is reported as diploid sequence on Chromosome X; on chromosome Y the sequence is reported as varType = 'PAR-called-in-X'.
+	8. reference. The reference sequence for the locus of variation. Empty when varType is ins. A value of '=' indicates that the user must consult the reference for the sequence; this shorthand is only used in regions where no haplotype deviates from the reference sequence.
+	9. alleleSeq. The observed sequence at the locus of variation. Empty when varType is del. '?' isused to indicate 0 or more unknown bases within the sequence; 'N' is used to indicate exactly one unknown base within the sequence.'=' is used as shorthand to indicate identity to the reference sequence for non-variant sequence, i.e. when varType is 'ref'.
+	10. totalScore. A score corresponding to a single variation and haplotype, representing the confidence in the call.
+	11. hapLink. Identifier that links a haplotype at one locus to haplotypes at other loci. Currently only populated for very proximate variations that were assembled together. Two calls that share a hapLink identifier are expected to be on the same haplotype,
+	12. xRef. Field containing external variation identifiers, currently only populated for variations corroborated directly by dbSNP. Format: dbsnp:[rsID], with multiple entries separated by the semicolon (;).
+In older versions of the format specification, the sub keyword used to be insdel
+keyword. ANNOVAR takes care of this.
+=item * B<SOAPsnp format>
+An example of the SOAP SNP caller format is shown below:
+	chr8  35782  A  R  1  A  27  1  2  G  26  1  2  5   0.500000  2.00000  1  5
+	chr8  35787  G  R  0  G  25  4  6  A  17  2  4  10  0.266667  1.60000  0  5
+The following information is provided in documentation from BGI who developed
+SOAP suite. It differs slightly from the description at the SOAPsnp website, and
+presumably the website is outdated.
+	Format description:(left to right)
+	1. Chromosome name
+	2. Position of locus
+	3. Nucleotide at corresponding locus of reference sequence
+	4. Genotype of sequencing sample
+	5. Quality value
+	6. nucleotide with the highest probability(first nucleotide)
+	7. Quality value of the nucleotide with the highest probability
+	8. Number of supported reads that can only be aligned to this locus
+	9. Number of all supported reads that can be aligned to this locus
+	10. Nucleotide with higher probability
+	11. Quality value of nucleotide with higher probability
+	12. Number of supported reads that can only be aligned to this locus
+	13. Number of all supported reads that can be aligned to this locus
+	14. Total number of reads that can be aligned to this locus
+	15. Order and quality value
+	16. Estimated copy number for this locus
+	17. Presence of this locus in the dbSNP database. 1 refers to presence and 0 refers to inexistence
+	18. The distance between this locus and another closest SNP
+=item * B<SOAPindel format>
+The current version of ANNOVAR handles SoapSNP and SoapIndel automatically via a
+single argument '--format soap'. An example of SOAP indel caller format is shown
+below:
+	chr11   44061282        -       +2      CT      Hete
+	chr11   45901572        +       +1      C       Hete
+	chr11   48242562        *       -3      TTC     Homo
+	chr11   57228723        *       +4      CTTT    Homo
+	chr11   57228734        *       +4      CTTT    Homo
+	chr11   57555685        *       -1      C       Hete
+	chr11   61482191        -       +3      TCC     Hete
+	chr11   64608031        *       -1      T       Homo
+	chr11   64654936        *       +1      C       Homo
+	chr11   71188303        +       -1      T       Hete
+	chr11   75741034        +       +1      T       Hete
+	chr11   76632438        *       +1      A       Hete
+	chr11   89578266        *       -2      AG      Homo
+	chr11   104383261       *       +1      T       Hete
+	chr11   124125940       +       +4      CCCC    Hete
+	chr12   7760052 *       +1      T       Homo
+	chr12   8266049 *       +3      ACG     Homo
+I do not see a documentation describing this format yet as of September 2010.
+=item B<--SOAPsv format>
+An example is given below:
+	Chr2 Deletion 42894 43832 43167 43555 388 0-0-0 FR 41
+An explanation of the structural variation format is given below:
+	Format description (from left to right)
+	1. Chromosome name
+	2. Type of structure variation
+	3. Minimal value of start position in cluster
+	4. Maximal value of end position in cluster
+	5. Estimated start position of this structure variation
+	6. Estimated end position of this structure variation
+	7. Length of SV
+	8. Breakpoint of SV (only for insertion)
+	9. Unusual matching mode (F refers to align with forward sequence, R refers
+	to align with reverse
+	sequence)
+	10. number of paired-end read which support this structure variation
+=item * B<MAQ format>
+MAQ can perform alignment and generate genotype calls, including SNP calls and
+indel calls. The format is described below:
+For indel header: The output is TAB delimited with each line consisting of chromosome, start
+position, type of the indel, number of reads across the indel, size of the indel
+and inserted/deleted nucleotides (separated by colon), number of indels on the
+reverse strand, number of indels on the forward strand, 5' sequence ahead of the
+indel, 3' sequence following the indel, number of reads aligned without indels
+and three additional columns for filters.
+An example is below:
+	chr10   110583  -       2       -2:AG   0       1       GCGAGACTCAGTATCAAAAAAAAAAAAAAAAA        AGAAAGAAAGAAAAAGAAAAAAATAGAAAGAA        1       @2,     @72,   @0,
+	chr10   120134  -       8       -2:CA   0       1       CTCTTGCCCGCTCACACATGTACACACACGCG        CACACACACACACACACATCAGCTACCTACCT        7       @65,62,61,61,45,22,7,   @9,12,13,13,29,52,67,   @0,0,0,0,0,0,0,
+	chr10   129630  -       1       -1:T    1       0       ATGTTGTGACTCTTAATGGATAAGTTCAGTCA        TTTTTTTTTAGCTTTTAACCGGACAAAAAAAG        0       @       @      @
+	chr10   150209  -       1       4:TTCC  1       0       GCATATAGGGATGGGCACTTTACCTTTCTTTT        TTCCTTCCTTCCTTCCTTCCCTTTCCTTTCCT        0       @       @      @
+	chr10   150244  -       2       -4:TTCT 0       1       CTTCCTTCCTTCCTTCCCTTTCCTTTCCTTTC        TTCTTTCTTTCTTTCTTTCTTTTTTTTTTTTT        0       @       @      @
+	chr10   159622  -       1       3:AGG   0       1       GAAGGAGGAAGGACGGAAGGAGGAAGGAAGGA        AGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA        0       @       @      @
+	chr10   206372  -       2       2:GT    1       0       ATAATAGTAACTGTGTATTTGATTATGTGTGC        GTGTGTGTGTGTGTGTGTGTGTGTGCGTGCTT        1       @37,    @37,   @8,
+	chr10   245751  -       11      -1:C    0       1       CTCATAAATACAAGTCATAATGAAAGAAATTA        CCACCATTTTCTTATTTTCATTCATTTTTAGT        10      @69,64,53,41,30,25,22,14,5,4,   @5,10,21,33,44,49,52,60,69,70,  @0,0,0,0,0,0,0,0,0,0,
+	chr10   253066  -       1       2:TT    0       1       TATTGATGAGGGTGGATTATACTTTAGAACAC        TATTCAAACAGTTCTTCCACATATCTCCCTTT        0       @       @      @
+	chr10   253455  -       2       -3:AAA  1       0       GTTGCACTCCAGCCTGGCGAGATTCTGTCTCC        AAAAAAAAAAAAAAAAATTGTTGTGAAATACA        1       @55,    @19,   @4,
+For snp output file: Each line consists of chromosome, position, reference base,
+consensus base, Phred-like consensus quality, read depth, the average number of
+hits of reads covering this position, the highest mapping quality of the reads
+covering the position, the minimum consensus quality in the 3bp flanking regions
+at each side of the site (6bp in total), the second best call, log likelihood
+ratio of the second best and the third best call, and the third best call.
+An example is below:
+	chr10   83603   C       T       28      12      2.81    63      34      Y       26      C
+	chr10   83945   G       R       59      61      4.75    63      62      A       47      G
+	chr10   83978   G       R       47      40      3.31    63      62      A       21      G
+	chr10   84026   G       R       89      22      2.44    63      62      G       49      A
+	chr10   84545   C       T       54      9       1.69    63      30      N       135     N
+	chr10   85074   G       A       42      5       1.19    63      38      N       108     N
+	chr10   85226   A       T       42      5       1.00    63      42      N       107     N
+	chr10   85229   C       T       42      5       1.00    63      42      N       112     N
+	chr10   87518   A       G       39      4       3.25    63      38      N       9       N
+	chr10   116402  T       C       39      4       1.00    63      38      N       76      N
+=item * B<CASAVA format>
+An example of Illumina CASAVA format is given below:
+	#position       A       C       G       T       modified_call   total   used    score           reference       type
+	14930   3       0       8       0       GA      11      11      29.10:11.10             A       SNP_het2
+	14933   4       0       7       0       GA      11      11      23.50:13.70             G       SNP_het1
+	14976   3       0       8       0       GA      11      11      24.09:9.10              G       SNP_het1
+	15118   2       1       4       0       GA      8       7       10.84:6.30              A       SNP_het2
+An example of the indels is given below:
+	# ** CASAVA depth-filtered indel calls **
+	#$ CMDLINE /illumina/pipeline/install/CASAVA_v1.7.0/libexec/CASAVA-1.7.0/filterIndelCalls.pl--meanReadDepth=2.60395068970547 --indelsCovCutoff=-1 --chrom=chr1.fa /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0000.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0001.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0002.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0003.txt /data/Basecalls/100806_HARMONIAPILOT-H16_0338_A2065HABXX/Data/Intensities/BaseCalls/CASAVA_PE_L2/Parsed_14-08-10/chr1.fa/Indel/varling_indel_calls_0004.txt
+	#$ CHROMOSOME chr1.fa
+	#$ MAX_DEPTH undefined
+	#
+	#$ COLUMNS pos CIGAR ref_upstream ref/indel ref_downstream Q(indel) max_gtype Q(max_gtype) max2_gtype bp1_reads ref_reads indel_reads other_reads repeat_unit ref_repeat_count indel_repeat_count
+	948847  1I      CCTCAGGCTT      -/A     ATAATAGGGC      969     hom     47      het     22      0       16      6       A       1       2
+	978604  2D      CACTGAGCCC      CT/--   GTGTCCTTCC      251     hom     20      het     8       0       4       4       CT      1       0
+	1276974 4I      CCTCATGCAG      ----/ACAC       ACACATGCAC      838     hom     39      het     18      0       14      4       AC      2       4
+	1289368 2D      AGCCCGGGAC      TG/--   GGAGCCGCGC      1376    hom     83      het     33      0       25      9       TG      1       0
+=item * B<VCF4 format>
+VCF4 can be used to describe both population-level variation information, or for
+reads derived from a single individual.
+One example of the indel format for one individual is given below:
+	##fileformat=VCFv4.0
+	##IGv2_bam_file_used=MIAPACA2.alnReAln.bam
+	##INFO=<ID=AC,Number=2,Type=Integer,Description="# of reads supporting consensus indel/any indel at the site">
+	##INFO=<ID=DP,Number=1,Type=Integer,Description="total coverage at the site">
+	##INFO=<ID=MM,Number=2,Type=Float,Description="average # of mismatches per consensus indel-supporting read/per reference-supporting read">
+	##INFO=<ID=MQ,Number=2,Type=Float,Description="average mapping quality of consensus indel-supporting reads/reference-supporting reads">
+	##INFO=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads">
+	##INFO=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads">
+	##INFO=<ID=SC,Number=4,Type=Integer,Description="strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads">
+	##IndelGenotyperV2=""
+	##reference=hg18.fa
+	##source=IndelGenotyperV2
+	#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  Miapaca_trimmed_sorted.bam
+	chr1    439     .       AC      A       .       PASS    AC=5,5;DP=7;MM=7.0,3.0;MQ=23.4,1.0;NQSBQ=23.98,25.5;NQSMM=0.04,0.0;SC=2,3,0,2   GT      1/0
+	chr1    714048  .       T       TCAAC   .       PASS    AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.266666,21.932203;NQSMM=0.0,0.15254237;SC=3,0,3,3 GT      0/1
+	chr1    714049  .       G       GC      .       PASS    AC=3,3;DP=9;MM=3.0,7.1666665;MQ=1.0,10.833333;NQSBQ=23.233334,21.83051;NQSMM=0.0,0.15254237;SC=3,0,3,3  GT      0/1
+	chr1    813675  .       A       AATAG   .       PASS    AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=25.74,25.166666;NQSMM=0.0,0.033333335;SC=4,1,1,2       GT      0/1
+	chr1    813687  .       AGAGAGAGAGAAG   A       .       PASS    AC=5,5;DP=8;MM=0.4,1.0;MQ=5.0,67.0;NQSBQ=24.54,25.2;NQSMM=0.02,0.06666667;SC=4,1,1,2    GT      1/0
+=back
+The code was written by Dr. Kai Wang and modified by Dr. Germ�n Gast�n Leparc.
+Various users have provided sample input files for many SNP callin software, for
+the development of conversion subroutines. We thank these users for their
+continued support to improve the functionality of the script.
+For questions or comments, please contact kai@openbioinformatics.org.
+=cut

Mercurial > repos > rdaveau > gfap

comparison gfapts/inc/annovar/convert2annovar.pl @ 0:f753b30013e6 draft