1
|
1
|
|
2 #!/usr/bin/perl
|
|
3
|
|
4 use strict;
|
|
5 use Getopt::Long;
|
|
6
|
|
7 my $usage = qq~Usage:$0 <args> [<opts>]
|
|
8
|
|
9 where <args> are:
|
|
10
|
|
11 -v, --vcf <VCF input>
|
|
12 -o, --out <Output basename>
|
|
13
|
|
14 <opts> are:
|
|
15
|
|
16 -r, --reference <Reference fasta file>
|
|
17 -g, --gff <GFF input file to create alignments of genes>
|
|
18 ~;
|
|
19 $usage .= "\n";
|
|
20
|
|
21 my ($input,$out,$reference,$gff);
|
|
22
|
|
23
|
|
24
|
|
25 GetOptions(
|
|
26 "vcf=s" => \$input,
|
|
27 "out=s" => \$out,
|
|
28 "reference=s" => \$reference,
|
|
29 "gff=s" => \$gff
|
|
30 );
|
|
31
|
|
32
|
|
33 die $usage
|
|
34 if ( !$input || !$out);
|
|
35
|
|
36 if ($gff && !$reference)
|
|
37 {
|
|
38 die "You must provide a Fasta reference file when providing GFF annotation\n";
|
|
39 }
|
|
40
|
|
41
|
|
42 my %ref_sequences;
|
|
43 if ($reference)
|
|
44 {
|
|
45 my $id;
|
|
46 my $sequence = "";
|
|
47 open(my $R,$reference) or die "cannot open file: $reference";
|
|
48 while(<$R>)
|
|
49 {
|
|
50 my $line =$_;
|
|
51 $line =~s/\n//g;
|
|
52 $line =~s/\r//g;
|
|
53 if ($line =~ />([^\s]+)/){
|
|
54 $ref_sequences{$id} = $sequence;
|
|
55 $id=$1;$sequence="";
|
|
56 }
|
|
57 else
|
|
58 {
|
|
59 $sequence .= $line;
|
|
60 }
|
|
61 }
|
|
62 close($R);
|
|
63 $ref_sequences{$id} = $sequence;
|
|
64 }
|
|
65
|
|
66
|
|
67 my %chr_of_gene;
|
|
68 my %ann;
|
|
69 if ($gff)
|
|
70 {
|
|
71 open(my $G,$gff) or die "cannot open file: $gff";
|
|
72 while(<$G>)
|
|
73 {
|
|
74 my $line =$_;
|
|
75 $line =~s/\n//g;
|
|
76 $line =~s/\r//g;
|
|
77 my @i = split(/\t/,$line);
|
|
78 my $chr = $i[0];
|
|
79 my $feature = $i[2];
|
|
80 my $strand = $i[6];
|
|
81 my $start = $i[3];
|
|
82 my $stop = $i[4];
|
|
83 my $inf = $i[8];
|
|
84 if ($feature eq 'gene')
|
|
85 {
|
|
86 if ($inf =~/Name=([\w\-\.]+)[;\s]*/){$inf = $1;}
|
|
87 $ann{$inf}{"start"}=$start;
|
|
88 $ann{$inf}{"stop"}=$stop;
|
|
89 $ann{$inf}{"strand"}=$strand;
|
|
90 $chr_of_gene{$inf} = $chr;
|
|
91 }
|
|
92 }
|
|
93 close($G);
|
|
94 }
|
|
95
|
|
96
|
|
97
|
|
98 my %IUPAC =
|
|
99 (
|
|
100 '[A/G]'=> "R",
|
|
101 '[G/A]'=> "R",
|
|
102 '[C/T]'=> "Y",
|
|
103 '[T/C]'=> "Y",
|
|
104 '[T/G]'=> "K",
|
|
105 '[G/T]'=> "K",
|
|
106 '[C/G]'=> "S",
|
|
107 '[G/C]'=> "S",
|
|
108 '[A/T]'=> "W",
|
|
109 '[T/A]'=> "W",
|
|
110 '[A/C]'=> "M",
|
|
111 '[C/A]'=> "M",
|
|
112 '[C/A/T]'=> "H",
|
|
113 '[A/T/C]'=> "H",
|
|
114 '[A/C/T]'=> "H",
|
|
115 '[C/T/A]'=> "H",
|
|
116 '[T/C/A]'=> "H",
|
|
117 '[T/A/C]'=> "H",
|
|
118 '[C/A/G]'=> "V",
|
|
119 '[A/G/C]'=> "V",
|
|
120 '[A/C/G]'=> "V",
|
|
121 '[C/G/A]'=> "V",
|
|
122 '[G/C/A]'=> "V",
|
|
123 '[G/A/C]'=> "V",
|
|
124 '[C/T/G]'=> "B",
|
|
125 '[T/G/C]'=> "B",
|
|
126 '[T/C/G]'=> "B",
|
|
127 '[C/G/T]'=> "B",
|
|
128 '[G/C/T]'=> "B",
|
|
129 '[G/T/C]'=> "B",
|
|
130 '[T/A/G]'=> "D",
|
|
131 '[A/G/T]'=> "D",
|
|
132 '[A/T/G]'=> "D",
|
|
133 '[T/G/A]'=> "D",
|
|
134 '[G/T/A]'=> "D",
|
|
135 '[G/A/T]'=> "D",
|
|
136 );
|
|
137
|
|
138 my %snps_of_gene;
|
|
139 my %snps_of_gene2;
|
|
140 my %indiv_order;
|
|
141 my $indiv_list;
|
|
142 my %genotyping_infos;
|
|
143 my $num_line = 0;
|
|
144 my $genename_rank_in_snpeff = 4;
|
|
145
|
|
146 my $find_annotations = `grep -c 'EFF=' $input`;
|
|
147
|
|
148 open(my $HAPMAP,">$out.hapmap");
|
|
149 print $HAPMAP "rs# alleles chrom pos gene feature effect codon_change amino_acid_change MAF missing_data";
|
|
150 open(my $VCF,$input);
|
|
151 while(<$VCF>)
|
|
152 {
|
|
153 my $line = $_;
|
|
154 chomp($line);
|
|
155 my @infos = split(/\t/,$line);
|
|
156
|
|
157 if (/^##INFO=\<ID=EFF/ && /Amino_Acid_length \| Gene_Name \| Transcript_BioType \| Gene_Coding/)
|
|
158 {
|
|
159 $genename_rank_in_snpeff = 8;
|
|
160 }
|
|
161
|
|
162 if (scalar @infos > 9)
|
|
163 {
|
|
164 if (/#CHROM/)
|
|
165 {
|
|
166 for (my $j=9;$j<=$#infos;$j++)
|
|
167 {
|
|
168 my $individu = $infos[$j];
|
|
169 $indiv_list .= " $individu";
|
|
170 $indiv_order{$j} = $individu;
|
|
171 }
|
|
172 print $HAPMAP "$indiv_list\n";
|
|
173 }
|
|
174 elsif (!/^#/)
|
|
175 {
|
|
176 $num_line++;
|
|
177
|
|
178 my $chromosome = $infos[0];
|
|
179 my $chromosome_position = $infos[1];
|
|
180 my $ref_allele = $infos[3];
|
|
181 my $alt_allele = $infos[4];
|
|
182
|
|
183 if ($ref_allele =~/\w\w+/)
|
|
184 {
|
|
185 $ref_allele = "A";
|
|
186 $alt_allele = "T";
|
|
187 }
|
|
188 elsif ($alt_allele =~/\w\w+/)
|
|
189 {
|
|
190 $ref_allele = "T";
|
|
191 $alt_allele = "A";
|
|
192 }
|
|
193
|
|
194 my $info = $infos[7];
|
|
195 my $is_in_exon = "#";
|
|
196 my $is_synonyme = "#";
|
|
197 my $gene;
|
|
198 if ($find_annotations > 1)
|
|
199 {
|
|
200 $gene = "intergenic";
|
|
201 }
|
|
202 else
|
|
203 {
|
|
204 $gene = $chromosome;
|
|
205 }
|
|
206 my $modif_codon = "#";
|
|
207 my $modif_aa = "#";
|
|
208 my $geneposition;
|
|
209 if ($info =~/EFF=(.*)/)
|
|
210 {
|
|
211 my @annotations = split(",",$1);
|
|
212 foreach my $annotation(@annotations)
|
|
213 {
|
|
214 my ($syn, $additional) = split(/\(/,$annotation);
|
|
215
|
|
216 if ($syn =~/STREAM/){next;}
|
|
217
|
|
218 $is_in_exon = "exon";
|
|
219 if ($syn =~/UTR/)
|
|
220 {
|
|
221 $is_in_exon = $syn;
|
|
222 }
|
|
223 else
|
|
224 {
|
|
225 $is_synonyme = $syn;
|
|
226 }
|
|
227 my @infos_additional = split(/\|/,$additional);
|
|
228 $gene = $infos_additional[$genename_rank_in_snpeff];
|
|
229 $modif_codon = $infos_additional[2];
|
|
230 $modif_aa = $infos_additional[3];
|
|
231
|
|
232 if ($syn =~/INTERGENIC/)
|
|
233 {
|
|
234 $is_synonyme = "#";
|
|
235 $gene = "intergenic";
|
|
236 $is_in_exon = "#";
|
|
237 }
|
|
238 elsif ($syn =~/SYNONYM/)
|
|
239 {
|
|
240 $is_in_exon = "exon";
|
|
241 }
|
|
242 elsif ($syn =~/INTRON/ or $syn =~/SPLICE_SITE_DONOR/)
|
|
243 {
|
|
244 $is_in_exon = "intron";
|
|
245 $is_synonyme = "#";
|
|
246 }
|
|
247
|
|
248
|
|
249 if ($modif_aa =~/(\w)(\d+)$/)
|
|
250 {
|
|
251 $modif_aa = "$1/$1";
|
|
252 }
|
|
253 elsif ($modif_aa =~/(\w)(\d+)(\w)/)
|
|
254 {
|
|
255 $modif_aa = "$1/$3";
|
|
256 }
|
|
257 if ($infos_additional[8] =~/Exon/)
|
|
258 {
|
|
259 $is_in_exon = "exon";
|
|
260 }
|
|
261
|
|
262 if (!$modif_aa){$modif_aa="#";}
|
|
263 if (!$modif_codon){$modif_codon="#";}
|
|
264 }
|
|
265 }
|
|
266 $gene =~s/\.\d//g;
|
|
267 if ($ann{$gene}{"start"})
|
|
268 {
|
|
269 my $strand = $ann{$gene}{"strand"};
|
|
270 if ($strand eq '-')
|
|
271 {
|
|
272 $geneposition = $ann{$gene}{"stop"} - $chromosome_position;
|
|
273 }
|
|
274 else
|
|
275 {
|
|
276 $geneposition = $chromosome_position - $ann{$gene}{"start"};
|
|
277 }
|
|
278 }
|
|
279 #if ($info =~/GenePos=(\d+);/)
|
|
280 #{
|
|
281 # $geneposition = $1;
|
|
282 #}
|
|
283 my $ratio_missing_data;
|
|
284 my $snp_frequency;
|
|
285 my $genotyping = "";
|
|
286
|
|
287 if (2 > 1)
|
|
288 {
|
|
289
|
|
290 $genotyping_infos{"ref"} = "$ref_allele$ref_allele";
|
|
291
|
|
292 my %alleles_found;
|
|
293 my $nb_readable_ind = 0;
|
|
294 for (my $i = 9;$i <= $#infos;$i++)
|
|
295 {
|
|
296 my $dnasample = $indiv_order{$i};
|
|
297 my @infos_alleles = split(":",$infos[$i]);
|
|
298 my $genotype = $infos_alleles[0];
|
|
299 $genotype =~s/0/$ref_allele/g;
|
|
300
|
|
301 if ($alt_allele =~/,/)
|
|
302 {
|
|
303 my @alt_alleles = split(",",$alt_allele);
|
|
304 my $num_all = 1;
|
|
305 foreach my $alt_al(@alt_alleles)
|
|
306 {
|
|
307 $genotype =~s/$num_all/$alt_al/g;
|
|
308 $num_all++;
|
|
309 }
|
|
310 }
|
|
311 else
|
|
312 {
|
|
313 $genotype =~s/1/$alt_allele/g;
|
|
314 }
|
|
315 if ($genotype eq '.'){$genotype = "./.";}
|
|
316 $genotype =~s/\./N/g;
|
|
317 if ($genotype !~/N\/N/)
|
|
318 {
|
|
319 $nb_readable_ind++;
|
|
320 }
|
|
321 my @alleles;
|
|
322 if ($genotype =~/\//)
|
|
323 {
|
|
324 @alleles = split(/\//,$genotype);
|
|
325 }
|
|
326 else
|
|
327 {
|
|
328 @alleles = split(/\|/,$genotype);
|
|
329 }
|
|
330 $genotyping .= join("",@alleles) . " ";
|
|
331 $genotyping_infos{$dnasample} = join("",@alleles);
|
|
332
|
|
333 foreach my $al(@alleles)
|
|
334 {
|
|
335 if ($al ne 'N'){$alleles_found{$al}++;}
|
|
336 }
|
|
337 }
|
|
338 chop($genotyping);
|
|
339
|
|
340 $snp_frequency = 0;
|
|
341
|
|
342 my $max = 0;
|
|
343 my $min = 10000000;
|
|
344 my $total = 0;
|
|
345 foreach my $al(keys(%alleles_found))
|
|
346 {
|
|
347 my $nb = $alleles_found{$al};
|
|
348 $total+= $nb;
|
|
349 if ($nb > $max)
|
|
350 {
|
|
351 $max = $nb;
|
|
352 }
|
|
353 if ($nb < $min)
|
|
354 {
|
|
355 $min = $nb;
|
|
356 }
|
|
357 }
|
|
358 if ($total > 0)
|
|
359 {
|
|
360 $snp_frequency = sprintf("%.1f",($min/$total)*100);
|
|
361 }
|
|
362
|
|
363 $ratio_missing_data = 100 - ($nb_readable_ind / ($#infos - 8)) * 100;
|
|
364 $ratio_missing_data = sprintf("%.1f",$ratio_missing_data);
|
|
365
|
|
366 foreach my $dna(keys(%genotyping_infos))
|
|
367 {
|
|
368 $snps_of_gene{$gene}{$geneposition}{$dna} = $genotyping_infos{$dna};
|
|
369 }
|
|
370 }
|
|
371 my $snp_type = "[$ref_allele/$alt_allele]";
|
|
372 $snps_of_gene2{$chromosome}{$chromosome_position} = $snp_type;
|
|
373 #print $HAPMAP "$chromosome:$chromosome_position\t$snp_type\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$nb_readable_ind\t$genotyping\n";
|
|
374 print $HAPMAP "$chromosome:$chromosome_position\t$ref_allele/$alt_allele\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$ratio_missing_data%\t$genotyping\n";
|
|
375 }
|
|
376 }
|
|
377 }
|
|
378 close($VCF);
|
|
379 close($HAPMAP);
|
|
380
|
|
381
|
|
382 if (!$reference){exit;}
|
|
383
|
|
384 #################################################################
|
|
385 # generate flanking sequences for Illumina VeraCode technology
|
|
386 #################################################################
|
|
387 open(my $FLANKING,">$out.flanking.txt");
|
|
388 foreach my $seq(keys(%ref_sequences))
|
|
389 {
|
|
390 if ($snps_of_gene2{$seq})
|
|
391 {
|
|
392 my $refhash = $snps_of_gene2{$seq};
|
|
393 my %hashreal = %$refhash;
|
|
394
|
|
395 # create consensus
|
|
396 my $refseq = $ref_sequences{$seq};
|
|
397 my $consensus = "";
|
|
398 my $previous = 0;
|
|
399 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
|
|
400 {
|
|
401 my $length = $pos - $previous - 1;
|
|
402 $consensus .= substr($refseq,$previous,$length);
|
|
403 my $iupac_code = $IUPAC{$snps_of_gene2{$seq}{$pos}};
|
|
404 $consensus .= $iupac_code;
|
|
405 $previous = $pos;
|
|
406 }
|
|
407 my $length = length($refseq) - $previous;
|
|
408 $consensus .= substr($refseq,$previous,$length);
|
|
409
|
|
410 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
|
|
411 {
|
|
412 my $snp_name = "$seq-$pos";
|
|
413 my $flanking_length = 60;
|
|
414 my $length = $flanking_length;
|
|
415 my $start = $pos - $flanking_length - 1;
|
|
416 if ($pos <= $flanking_length)
|
|
417 {
|
|
418 $length = $pos - 1;
|
|
419 $start = 0;
|
|
420 }
|
|
421 my $sequence = substr($consensus,$start,$length);
|
|
422
|
|
423 $sequence .= $snps_of_gene2{$seq}{$pos};
|
|
424 $sequence .= substr($consensus,$pos,$flanking_length);
|
|
425 print $FLANKING "$snp_name,$sequence,0,0,0,Project_name,0,diploid,Other,Forward\n";
|
|
426 }
|
|
427 }
|
|
428 }
|
|
429
|
|
430 close($FLANKING);
|
|
431
|
|
432 if (!$gff){exit;}
|
|
433
|
|
434 my @individuals_list = split(/\t/,$indiv_list);
|
|
435 if ((scalar @individuals_list * scalar keys(%snps_of_gene)) > 800000)
|
|
436 {
|
|
437 print "Sorry, too many sequences to manage ...\n";
|
|
438 exit;
|
|
439 }
|
|
440
|
|
441 open(my $ALIGN_EGGLIB,">$out.gene_alignment.fas");
|
|
442 my %alignments_ind;
|
|
443 foreach my $seq(keys(%snps_of_gene))
|
|
444 {
|
|
445 if ($snps_of_gene{$seq})
|
|
446 {
|
|
447 my $refhash = $snps_of_gene{$seq};
|
|
448 my %hashreal = %$refhash;
|
|
449
|
|
450 # get flanking sequences
|
|
451 my %flanking5;
|
|
452 my $start = $ann{$seq}{"start"};
|
|
453 my $stop = $ann{$seq}{"stop"};
|
|
454 my $strand = $ann{$seq}{"strand"};
|
|
455 my $genelength = $stop - $start+1;
|
|
456 my $chr = $chr_of_gene{$seq};
|
|
457 my $refseq = substr($ref_sequences{$chr},$start-1,$genelength);
|
|
458 if ($strand eq '-')
|
|
459 {
|
|
460 $refseq =~ tr /atcgATCG/tagcTAGC/; $refseq = reverse($refseq);
|
|
461 }
|
|
462 #print "$seq $chr $start $stop $refseq \n";
|
|
463 my $previous = 0;
|
|
464 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
|
|
465 {
|
|
466 my $length = $pos - $previous - 1;
|
|
467 $flanking5{$pos} = substr($refseq,$previous,$length);
|
|
468 $previous = $pos;
|
|
469 }
|
|
470 my $length = length($refseq) - $previous;
|
|
471 my $flanking3 = substr($refseq,$previous,$length);
|
|
472 foreach my $ind(@individuals_list)
|
|
473 {
|
|
474 my $nb_missing_data_for_this_individual = 0;
|
|
475 if ($ind)
|
|
476 {
|
|
477 my $alignment_for_ind = "";
|
|
478 my $seq_without_underscore = $seq;
|
|
479 $seq_without_underscore =~s/_//g;
|
|
480 $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_1\n";
|
|
481 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
|
|
482 {
|
|
483 $alignment_for_ind .= $flanking5{$pos};
|
|
484 my $geno = $snps_of_gene{$seq}{$pos}{$ind};
|
|
485 $geno =~s/N/?/g;
|
|
486 if ($geno =~/\?/){$nb_missing_data_for_this_individual++;}
|
|
487 my @alleles = split("",$geno);
|
|
488 $alignment_for_ind .= $alleles[0];
|
|
489 if ($alleles[0] eq $alleles[1])
|
|
490 {
|
|
491 $alignments_ind{$ind} .= $alleles[1];
|
|
492 }
|
|
493 else
|
|
494 {
|
|
495 my $snp_type = "[" . $alleles[0] . "/" . $alleles[1] . "]";
|
|
496 $alignments_ind{$ind} .= $IUPAC{$snp_type};
|
|
497 }
|
|
498 }
|
|
499 $alignment_for_ind .= $flanking3;
|
|
500 $alignment_for_ind .= "\n";
|
|
501
|
|
502
|
|
503 $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_2\n";
|
|
504 foreach my $pos(sort {$a<=>$b} keys(%hashreal))
|
|
505 {
|
|
506 $alignment_for_ind .= $flanking5{$pos};
|
|
507 my $geno = $snps_of_gene{$seq}{$pos}{$ind};
|
|
508 $geno =~s/N/?/g;
|
|
509 my @alleles = split("",$geno);
|
|
510 $alignment_for_ind .= $alleles[1];
|
|
511 }
|
|
512 $alignment_for_ind .= $flanking3;
|
|
513 $alignment_for_ind .= "\n";
|
|
514 if (keys(%hashreal) != $nb_missing_data_for_this_individual)
|
|
515 {
|
|
516 print $ALIGN_EGGLIB $alignment_for_ind;
|
|
517 }
|
|
518 }
|
|
519 }
|
|
520 }
|
|
521 }
|
|
522 close($ALIGN_EGGLIB);
|
|
523
|
|
524
|
|
525
|