Mercurial > repos > dereeper > sniplay
diff VCF2Hapmap/VCF2FastaAndHapmap.pl @ 3:345f88a8f483 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 10:38:43 -0400 |
parents | 420b57c3c185 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/VCF2Hapmap/VCF2FastaAndHapmap.pl Fri Jul 10 10:38:43 2015 -0400 @@ -0,0 +1,525 @@ + +#!/usr/bin/perl + +use strict; +use Getopt::Long; + +my $usage = qq~Usage:$0 <args> [<opts>] + +where <args> are: + + -v, --vcf <VCF input> + -o, --out <Output basename> + +<opts> are: + + -r, --reference <Reference fasta file> + -g, --gff <GFF input file to create alignments of genes> +~; +$usage .= "\n"; + +my ($input,$out,$reference,$gff); + + + +GetOptions( + "vcf=s" => \$input, + "out=s" => \$out, + "reference=s" => \$reference, + "gff=s" => \$gff +); + + +die $usage + if ( !$input || !$out); + +if ($gff && !$reference) +{ + die "You must provide a Fasta reference file when providing GFF annotation\n"; +} + + +my %ref_sequences; +if ($reference) +{ + my $id; + my $sequence = ""; + open(my $R,$reference) or die "cannot open file: $reference"; + while(<$R>) + { + my $line =$_; + $line =~s/\n//g; + $line =~s/\r//g; + if ($line =~ />([^\s]+)/){ + $ref_sequences{$id} = $sequence; + $id=$1;$sequence=""; + } + else + { + $sequence .= $line; + } + } + close($R); + $ref_sequences{$id} = $sequence; +} + + +my %chr_of_gene; +my %ann; +if ($gff) +{ + open(my $G,$gff) or die "cannot open file: $gff"; + while(<$G>) + { + my $line =$_; + $line =~s/\n//g; + $line =~s/\r//g; + my @i = split(/\t/,$line); + my $chr = $i[0]; + my $feature = $i[2]; + my $strand = $i[6]; + my $start = $i[3]; + my $stop = $i[4]; + my $inf = $i[8]; + if ($feature eq 'gene') + { + if ($inf =~/Name=([\w\-\.]+)[;\s]*/){$inf = $1;} + $ann{$inf}{"start"}=$start; + $ann{$inf}{"stop"}=$stop; + $ann{$inf}{"strand"}=$strand; + $chr_of_gene{$inf} = $chr; + } + } + close($G); +} + + + +my %IUPAC = +( + '[A/G]'=> "R", + '[G/A]'=> "R", + '[C/T]'=> "Y", + '[T/C]'=> "Y", + '[T/G]'=> "K", + '[G/T]'=> "K", + '[C/G]'=> "S", + '[G/C]'=> "S", + '[A/T]'=> "W", + '[T/A]'=> "W", + '[A/C]'=> "M", + '[C/A]'=> "M", + '[C/A/T]'=> "H", + '[A/T/C]'=> "H", + '[A/C/T]'=> "H", + '[C/T/A]'=> "H", + '[T/C/A]'=> "H", + '[T/A/C]'=> "H", + '[C/A/G]'=> "V", + '[A/G/C]'=> "V", + '[A/C/G]'=> "V", + '[C/G/A]'=> "V", + '[G/C/A]'=> "V", + '[G/A/C]'=> "V", + '[C/T/G]'=> "B", + '[T/G/C]'=> "B", + '[T/C/G]'=> "B", + '[C/G/T]'=> "B", + '[G/C/T]'=> "B", + '[G/T/C]'=> "B", + '[T/A/G]'=> "D", + '[A/G/T]'=> "D", + '[A/T/G]'=> "D", + '[T/G/A]'=> "D", + '[G/T/A]'=> "D", + '[G/A/T]'=> "D", +); + +my %snps_of_gene; +my %snps_of_gene2; +my %indiv_order; +my $indiv_list; +my %genotyping_infos; +my $num_line = 0; +my $genename_rank_in_snpeff = 4; + +my $find_annotations = `grep -c 'EFF=' $input`; + +open(my $HAPMAP,">$out.hapmap"); +print $HAPMAP "rs# alleles chrom pos gene feature effect codon_change amino_acid_change MAF missing_data"; +open(my $VCF,$input); +while(<$VCF>) +{ + my $line = $_; + chomp($line); + my @infos = split(/\t/,$line); + + if (/^##INFO=\<ID=EFF/ && /Amino_Acid_length \| Gene_Name \| Transcript_BioType \| Gene_Coding/) + { + $genename_rank_in_snpeff = 8; + } + + if (scalar @infos > 9) + { + if (/#CHROM/) + { + for (my $j=9;$j<=$#infos;$j++) + { + my $individu = $infos[$j]; + $indiv_list .= " $individu"; + $indiv_order{$j} = $individu; + } + print $HAPMAP "$indiv_list\n"; + } + elsif (!/^#/) + { + $num_line++; + + my $chromosome = $infos[0]; + my $chromosome_position = $infos[1]; + my $ref_allele = $infos[3]; + my $alt_allele = $infos[4]; + + if ($ref_allele =~/\w\w+/) + { + $ref_allele = "A"; + $alt_allele = "T"; + } + elsif ($alt_allele =~/\w\w+/) + { + $ref_allele = "T"; + $alt_allele = "A"; + } + + my $info = $infos[7]; + my $is_in_exon = "#"; + my $is_synonyme = "#"; + my $gene; + if ($find_annotations > 1) + { + $gene = "intergenic"; + } + else + { + $gene = $chromosome; + } + my $modif_codon = "#"; + my $modif_aa = "#"; + my $geneposition; + if ($info =~/EFF=(.*)/) + { + my @annotations = split(",",$1); + foreach my $annotation(@annotations) + { + my ($syn, $additional) = split(/\(/,$annotation); + + if ($syn =~/STREAM/){next;} + + $is_in_exon = "exon"; + if ($syn =~/UTR/) + { + $is_in_exon = $syn; + } + else + { + $is_synonyme = $syn; + } + my @infos_additional = split(/\|/,$additional); + $gene = $infos_additional[$genename_rank_in_snpeff]; + $modif_codon = $infos_additional[2]; + $modif_aa = $infos_additional[3]; + + if ($syn =~/INTERGENIC/) + { + $is_synonyme = "#"; + $gene = "intergenic"; + $is_in_exon = "#"; + } + elsif ($syn =~/SYNONYM/) + { + $is_in_exon = "exon"; + } + elsif ($syn =~/INTRON/ or $syn =~/SPLICE_SITE_DONOR/) + { + $is_in_exon = "intron"; + $is_synonyme = "#"; + } + + + if ($modif_aa =~/(\w)(\d+)$/) + { + $modif_aa = "$1/$1"; + } + elsif ($modif_aa =~/(\w)(\d+)(\w)/) + { + $modif_aa = "$1/$3"; + } + if ($infos_additional[8] =~/Exon/) + { + $is_in_exon = "exon"; + } + + if (!$modif_aa){$modif_aa="#";} + if (!$modif_codon){$modif_codon="#";} + } + } + $gene =~s/\.\d//g; + if ($ann{$gene}{"start"}) + { + my $strand = $ann{$gene}{"strand"}; + if ($strand eq '-') + { + $geneposition = $ann{$gene}{"stop"} - $chromosome_position; + } + else + { + $geneposition = $chromosome_position - $ann{$gene}{"start"}; + } + } + #if ($info =~/GenePos=(\d+);/) + #{ + # $geneposition = $1; + #} + my $ratio_missing_data; + my $snp_frequency; + my $genotyping = ""; + + if (2 > 1) + { + + $genotyping_infos{"ref"} = "$ref_allele$ref_allele"; + + my %alleles_found; + my $nb_readable_ind = 0; + for (my $i = 9;$i <= $#infos;$i++) + { + my $dnasample = $indiv_order{$i}; + my @infos_alleles = split(":",$infos[$i]); + my $genotype = $infos_alleles[0]; + $genotype =~s/0/$ref_allele/g; + + if ($alt_allele =~/,/) + { + my @alt_alleles = split(",",$alt_allele); + my $num_all = 1; + foreach my $alt_al(@alt_alleles) + { + $genotype =~s/$num_all/$alt_al/g; + $num_all++; + } + } + else + { + $genotype =~s/1/$alt_allele/g; + } + if ($genotype eq '.'){$genotype = "./.";} + $genotype =~s/\./N/g; + if ($genotype !~/N\/N/) + { + $nb_readable_ind++; + } + my @alleles; + if ($genotype =~/\//) + { + @alleles = split(/\//,$genotype); + } + else + { + @alleles = split(/\|/,$genotype); + } + $genotyping .= join("",@alleles) . " "; + $genotyping_infos{$dnasample} = join("",@alleles); + + foreach my $al(@alleles) + { + if ($al ne 'N'){$alleles_found{$al}++;} + } + } + chop($genotyping); + + $snp_frequency = 0; + + my $max = 0; + my $min = 10000000; + my $total = 0; + foreach my $al(keys(%alleles_found)) + { + my $nb = $alleles_found{$al}; + $total+= $nb; + if ($nb > $max) + { + $max = $nb; + } + if ($nb < $min) + { + $min = $nb; + } + } + if ($total > 0) + { + $snp_frequency = sprintf("%.1f",($min/$total)*100); + } + + $ratio_missing_data = 100 - ($nb_readable_ind / ($#infos - 8)) * 100; + $ratio_missing_data = sprintf("%.1f",$ratio_missing_data); + + foreach my $dna(keys(%genotyping_infos)) + { + $snps_of_gene{$gene}{$geneposition}{$dna} = $genotyping_infos{$dna}; + } + } + my $snp_type = "[$ref_allele/$alt_allele]"; + $snps_of_gene2{$chromosome}{$chromosome_position} = $snp_type; + #print $HAPMAP "$chromosome:$chromosome_position\t$snp_type\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$nb_readable_ind\t$genotyping\n"; + print $HAPMAP "$chromosome:$chromosome_position\t$ref_allele/$alt_allele\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$ratio_missing_data%\t$genotyping\n"; + } + } +} +close($VCF); +close($HAPMAP); + + +if (!$reference){exit;} + +################################################################# +# generate flanking sequences for Illumina VeraCode technology +################################################################# +open(my $FLANKING,">$out.flanking.txt"); +foreach my $seq(keys(%ref_sequences)) +{ + if ($snps_of_gene2{$seq}) + { + my $refhash = $snps_of_gene2{$seq}; + my %hashreal = %$refhash; + + # create consensus + my $refseq = $ref_sequences{$seq}; + my $consensus = ""; + my $previous = 0; + foreach my $pos(sort {$a<=>$b} keys(%hashreal)) + { + my $length = $pos - $previous - 1; + $consensus .= substr($refseq,$previous,$length); + my $iupac_code = $IUPAC{$snps_of_gene2{$seq}{$pos}}; + $consensus .= $iupac_code; + $previous = $pos; + } + my $length = length($refseq) - $previous; + $consensus .= substr($refseq,$previous,$length); + + foreach my $pos(sort {$a<=>$b} keys(%hashreal)) + { + my $snp_name = "$seq-$pos"; + my $flanking_length = 60; + my $length = $flanking_length; + my $start = $pos - $flanking_length - 1; + if ($pos <= $flanking_length) + { + $length = $pos - 1; + $start = 0; + } + my $sequence = substr($consensus,$start,$length); + + $sequence .= $snps_of_gene2{$seq}{$pos}; + $sequence .= substr($consensus,$pos,$flanking_length); + print $FLANKING "$snp_name,$sequence,0,0,0,Project_name,0,diploid,Other,Forward\n"; + } + } + } + + close($FLANKING); + + if (!$gff){exit;} + + my @individuals_list = split(/\t/,$indiv_list); + if ((scalar @individuals_list * scalar keys(%snps_of_gene)) > 800000) + { + print "Sorry, too many sequences to manage ...\n"; + exit; + } + + open(my $ALIGN_EGGLIB,">$out.gene_alignment.fas"); + my %alignments_ind; + foreach my $seq(keys(%snps_of_gene)) + { + if ($snps_of_gene{$seq}) + { + my $refhash = $snps_of_gene{$seq}; + my %hashreal = %$refhash; + + # get flanking sequences + my %flanking5; + my $start = $ann{$seq}{"start"}; + my $stop = $ann{$seq}{"stop"}; + my $strand = $ann{$seq}{"strand"}; + my $genelength = $stop - $start+1; + my $chr = $chr_of_gene{$seq}; + my $refseq = substr($ref_sequences{$chr},$start-1,$genelength); + if ($strand eq '-') + { + $refseq =~ tr /atcgATCG/tagcTAGC/; $refseq = reverse($refseq); + } + #print "$seq $chr $start $stop $refseq \n"; + my $previous = 0; + foreach my $pos(sort {$a<=>$b} keys(%hashreal)) + { + my $length = $pos - $previous - 1; + $flanking5{$pos} = substr($refseq,$previous,$length); + $previous = $pos; + } + my $length = length($refseq) - $previous; + my $flanking3 = substr($refseq,$previous,$length); + foreach my $ind(@individuals_list) + { + my $nb_missing_data_for_this_individual = 0; + if ($ind) + { + my $alignment_for_ind = ""; + my $seq_without_underscore = $seq; + $seq_without_underscore =~s/_//g; + $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_1\n"; + foreach my $pos(sort {$a<=>$b} keys(%hashreal)) + { + $alignment_for_ind .= $flanking5{$pos}; + my $geno = $snps_of_gene{$seq}{$pos}{$ind}; + $geno =~s/N/?/g; + if ($geno =~/\?/){$nb_missing_data_for_this_individual++;} + my @alleles = split("",$geno); + $alignment_for_ind .= $alleles[0]; + if ($alleles[0] eq $alleles[1]) + { + $alignments_ind{$ind} .= $alleles[1]; + } + else + { + my $snp_type = "[" . $alleles[0] . "/" . $alleles[1] . "]"; + $alignments_ind{$ind} .= $IUPAC{$snp_type}; + } + } + $alignment_for_ind .= $flanking3; + $alignment_for_ind .= "\n"; + + + $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_2\n"; + foreach my $pos(sort {$a<=>$b} keys(%hashreal)) + { + $alignment_for_ind .= $flanking5{$pos}; + my $geno = $snps_of_gene{$seq}{$pos}{$ind}; + $geno =~s/N/?/g; + my @alleles = split("",$geno); + $alignment_for_ind .= $alleles[1]; + } + $alignment_for_ind .= $flanking3; + $alignment_for_ind .= "\n"; + if (keys(%hashreal) != $nb_missing_data_for_this_individual) + { + print $ALIGN_EGGLIB $alignment_for_ind; + } + } + } + } +} +close($ALIGN_EGGLIB); + + +