Mercurial > repos > dereeper > sniplay
comparison VCF2Hapmap/VCF2FastaAndHapmap.pl @ 1:420b57c3c185 draft
Uploaded
author | dereeper |
---|---|
date | Fri, 10 Jul 2015 04:39:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:3e19d0dfcf3e | 1:420b57c3c185 |
---|---|
1 | |
2 #!/usr/bin/perl | |
3 | |
4 use strict; | |
5 use Getopt::Long; | |
6 | |
7 my $usage = qq~Usage:$0 <args> [<opts>] | |
8 | |
9 where <args> are: | |
10 | |
11 -v, --vcf <VCF input> | |
12 -o, --out <Output basename> | |
13 | |
14 <opts> are: | |
15 | |
16 -r, --reference <Reference fasta file> | |
17 -g, --gff <GFF input file to create alignments of genes> | |
18 ~; | |
19 $usage .= "\n"; | |
20 | |
21 my ($input,$out,$reference,$gff); | |
22 | |
23 | |
24 | |
25 GetOptions( | |
26 "vcf=s" => \$input, | |
27 "out=s" => \$out, | |
28 "reference=s" => \$reference, | |
29 "gff=s" => \$gff | |
30 ); | |
31 | |
32 | |
33 die $usage | |
34 if ( !$input || !$out); | |
35 | |
36 if ($gff && !$reference) | |
37 { | |
38 die "You must provide a Fasta reference file when providing GFF annotation\n"; | |
39 } | |
40 | |
41 | |
42 my %ref_sequences; | |
43 if ($reference) | |
44 { | |
45 my $id; | |
46 my $sequence = ""; | |
47 open(my $R,$reference) or die "cannot open file: $reference"; | |
48 while(<$R>) | |
49 { | |
50 my $line =$_; | |
51 $line =~s/\n//g; | |
52 $line =~s/\r//g; | |
53 if ($line =~ />([^\s]+)/){ | |
54 $ref_sequences{$id} = $sequence; | |
55 $id=$1;$sequence=""; | |
56 } | |
57 else | |
58 { | |
59 $sequence .= $line; | |
60 } | |
61 } | |
62 close($R); | |
63 $ref_sequences{$id} = $sequence; | |
64 } | |
65 | |
66 | |
67 my %chr_of_gene; | |
68 my %ann; | |
69 if ($gff) | |
70 { | |
71 open(my $G,$gff) or die "cannot open file: $gff"; | |
72 while(<$G>) | |
73 { | |
74 my $line =$_; | |
75 $line =~s/\n//g; | |
76 $line =~s/\r//g; | |
77 my @i = split(/\t/,$line); | |
78 my $chr = $i[0]; | |
79 my $feature = $i[2]; | |
80 my $strand = $i[6]; | |
81 my $start = $i[3]; | |
82 my $stop = $i[4]; | |
83 my $inf = $i[8]; | |
84 if ($feature eq 'gene') | |
85 { | |
86 if ($inf =~/Name=([\w\-\.]+)[;\s]*/){$inf = $1;} | |
87 $ann{$inf}{"start"}=$start; | |
88 $ann{$inf}{"stop"}=$stop; | |
89 $ann{$inf}{"strand"}=$strand; | |
90 $chr_of_gene{$inf} = $chr; | |
91 } | |
92 } | |
93 close($G); | |
94 } | |
95 | |
96 | |
97 | |
98 my %IUPAC = | |
99 ( | |
100 '[A/G]'=> "R", | |
101 '[G/A]'=> "R", | |
102 '[C/T]'=> "Y", | |
103 '[T/C]'=> "Y", | |
104 '[T/G]'=> "K", | |
105 '[G/T]'=> "K", | |
106 '[C/G]'=> "S", | |
107 '[G/C]'=> "S", | |
108 '[A/T]'=> "W", | |
109 '[T/A]'=> "W", | |
110 '[A/C]'=> "M", | |
111 '[C/A]'=> "M", | |
112 '[C/A/T]'=> "H", | |
113 '[A/T/C]'=> "H", | |
114 '[A/C/T]'=> "H", | |
115 '[C/T/A]'=> "H", | |
116 '[T/C/A]'=> "H", | |
117 '[T/A/C]'=> "H", | |
118 '[C/A/G]'=> "V", | |
119 '[A/G/C]'=> "V", | |
120 '[A/C/G]'=> "V", | |
121 '[C/G/A]'=> "V", | |
122 '[G/C/A]'=> "V", | |
123 '[G/A/C]'=> "V", | |
124 '[C/T/G]'=> "B", | |
125 '[T/G/C]'=> "B", | |
126 '[T/C/G]'=> "B", | |
127 '[C/G/T]'=> "B", | |
128 '[G/C/T]'=> "B", | |
129 '[G/T/C]'=> "B", | |
130 '[T/A/G]'=> "D", | |
131 '[A/G/T]'=> "D", | |
132 '[A/T/G]'=> "D", | |
133 '[T/G/A]'=> "D", | |
134 '[G/T/A]'=> "D", | |
135 '[G/A/T]'=> "D", | |
136 ); | |
137 | |
138 my %snps_of_gene; | |
139 my %snps_of_gene2; | |
140 my %indiv_order; | |
141 my $indiv_list; | |
142 my %genotyping_infos; | |
143 my $num_line = 0; | |
144 my $genename_rank_in_snpeff = 4; | |
145 | |
146 my $find_annotations = `grep -c 'EFF=' $input`; | |
147 | |
148 open(my $HAPMAP,">$out.hapmap"); | |
149 print $HAPMAP "rs# alleles chrom pos gene feature effect codon_change amino_acid_change MAF missing_data"; | |
150 open(my $VCF,$input); | |
151 while(<$VCF>) | |
152 { | |
153 my $line = $_; | |
154 chomp($line); | |
155 my @infos = split(/\t/,$line); | |
156 | |
157 if (/^##INFO=\<ID=EFF/ && /Amino_Acid_length \| Gene_Name \| Transcript_BioType \| Gene_Coding/) | |
158 { | |
159 $genename_rank_in_snpeff = 8; | |
160 } | |
161 | |
162 if (scalar @infos > 9) | |
163 { | |
164 if (/#CHROM/) | |
165 { | |
166 for (my $j=9;$j<=$#infos;$j++) | |
167 { | |
168 my $individu = $infos[$j]; | |
169 $indiv_list .= " $individu"; | |
170 $indiv_order{$j} = $individu; | |
171 } | |
172 print $HAPMAP "$indiv_list\n"; | |
173 } | |
174 elsif (!/^#/) | |
175 { | |
176 $num_line++; | |
177 | |
178 my $chromosome = $infos[0]; | |
179 my $chromosome_position = $infos[1]; | |
180 my $ref_allele = $infos[3]; | |
181 my $alt_allele = $infos[4]; | |
182 | |
183 if ($ref_allele =~/\w\w+/) | |
184 { | |
185 $ref_allele = "A"; | |
186 $alt_allele = "T"; | |
187 } | |
188 elsif ($alt_allele =~/\w\w+/) | |
189 { | |
190 $ref_allele = "T"; | |
191 $alt_allele = "A"; | |
192 } | |
193 | |
194 my $info = $infos[7]; | |
195 my $is_in_exon = "#"; | |
196 my $is_synonyme = "#"; | |
197 my $gene; | |
198 if ($find_annotations > 1) | |
199 { | |
200 $gene = "intergenic"; | |
201 } | |
202 else | |
203 { | |
204 $gene = $chromosome; | |
205 } | |
206 my $modif_codon = "#"; | |
207 my $modif_aa = "#"; | |
208 my $geneposition; | |
209 if ($info =~/EFF=(.*)/) | |
210 { | |
211 my @annotations = split(",",$1); | |
212 foreach my $annotation(@annotations) | |
213 { | |
214 my ($syn, $additional) = split(/\(/,$annotation); | |
215 | |
216 if ($syn =~/STREAM/){next;} | |
217 | |
218 $is_in_exon = "exon"; | |
219 if ($syn =~/UTR/) | |
220 { | |
221 $is_in_exon = $syn; | |
222 } | |
223 else | |
224 { | |
225 $is_synonyme = $syn; | |
226 } | |
227 my @infos_additional = split(/\|/,$additional); | |
228 $gene = $infos_additional[$genename_rank_in_snpeff]; | |
229 $modif_codon = $infos_additional[2]; | |
230 $modif_aa = $infos_additional[3]; | |
231 | |
232 if ($syn =~/INTERGENIC/) | |
233 { | |
234 $is_synonyme = "#"; | |
235 $gene = "intergenic"; | |
236 $is_in_exon = "#"; | |
237 } | |
238 elsif ($syn =~/SYNONYM/) | |
239 { | |
240 $is_in_exon = "exon"; | |
241 } | |
242 elsif ($syn =~/INTRON/ or $syn =~/SPLICE_SITE_DONOR/) | |
243 { | |
244 $is_in_exon = "intron"; | |
245 $is_synonyme = "#"; | |
246 } | |
247 | |
248 | |
249 if ($modif_aa =~/(\w)(\d+)$/) | |
250 { | |
251 $modif_aa = "$1/$1"; | |
252 } | |
253 elsif ($modif_aa =~/(\w)(\d+)(\w)/) | |
254 { | |
255 $modif_aa = "$1/$3"; | |
256 } | |
257 if ($infos_additional[8] =~/Exon/) | |
258 { | |
259 $is_in_exon = "exon"; | |
260 } | |
261 | |
262 if (!$modif_aa){$modif_aa="#";} | |
263 if (!$modif_codon){$modif_codon="#";} | |
264 } | |
265 } | |
266 $gene =~s/\.\d//g; | |
267 if ($ann{$gene}{"start"}) | |
268 { | |
269 my $strand = $ann{$gene}{"strand"}; | |
270 if ($strand eq '-') | |
271 { | |
272 $geneposition = $ann{$gene}{"stop"} - $chromosome_position; | |
273 } | |
274 else | |
275 { | |
276 $geneposition = $chromosome_position - $ann{$gene}{"start"}; | |
277 } | |
278 } | |
279 #if ($info =~/GenePos=(\d+);/) | |
280 #{ | |
281 # $geneposition = $1; | |
282 #} | |
283 my $ratio_missing_data; | |
284 my $snp_frequency; | |
285 my $genotyping = ""; | |
286 | |
287 if (2 > 1) | |
288 { | |
289 | |
290 $genotyping_infos{"ref"} = "$ref_allele$ref_allele"; | |
291 | |
292 my %alleles_found; | |
293 my $nb_readable_ind = 0; | |
294 for (my $i = 9;$i <= $#infos;$i++) | |
295 { | |
296 my $dnasample = $indiv_order{$i}; | |
297 my @infos_alleles = split(":",$infos[$i]); | |
298 my $genotype = $infos_alleles[0]; | |
299 $genotype =~s/0/$ref_allele/g; | |
300 | |
301 if ($alt_allele =~/,/) | |
302 { | |
303 my @alt_alleles = split(",",$alt_allele); | |
304 my $num_all = 1; | |
305 foreach my $alt_al(@alt_alleles) | |
306 { | |
307 $genotype =~s/$num_all/$alt_al/g; | |
308 $num_all++; | |
309 } | |
310 } | |
311 else | |
312 { | |
313 $genotype =~s/1/$alt_allele/g; | |
314 } | |
315 if ($genotype eq '.'){$genotype = "./.";} | |
316 $genotype =~s/\./N/g; | |
317 if ($genotype !~/N\/N/) | |
318 { | |
319 $nb_readable_ind++; | |
320 } | |
321 my @alleles; | |
322 if ($genotype =~/\//) | |
323 { | |
324 @alleles = split(/\//,$genotype); | |
325 } | |
326 else | |
327 { | |
328 @alleles = split(/\|/,$genotype); | |
329 } | |
330 $genotyping .= join("",@alleles) . " "; | |
331 $genotyping_infos{$dnasample} = join("",@alleles); | |
332 | |
333 foreach my $al(@alleles) | |
334 { | |
335 if ($al ne 'N'){$alleles_found{$al}++;} | |
336 } | |
337 } | |
338 chop($genotyping); | |
339 | |
340 $snp_frequency = 0; | |
341 | |
342 my $max = 0; | |
343 my $min = 10000000; | |
344 my $total = 0; | |
345 foreach my $al(keys(%alleles_found)) | |
346 { | |
347 my $nb = $alleles_found{$al}; | |
348 $total+= $nb; | |
349 if ($nb > $max) | |
350 { | |
351 $max = $nb; | |
352 } | |
353 if ($nb < $min) | |
354 { | |
355 $min = $nb; | |
356 } | |
357 } | |
358 if ($total > 0) | |
359 { | |
360 $snp_frequency = sprintf("%.1f",($min/$total)*100); | |
361 } | |
362 | |
363 $ratio_missing_data = 100 - ($nb_readable_ind / ($#infos - 8)) * 100; | |
364 $ratio_missing_data = sprintf("%.1f",$ratio_missing_data); | |
365 | |
366 foreach my $dna(keys(%genotyping_infos)) | |
367 { | |
368 $snps_of_gene{$gene}{$geneposition}{$dna} = $genotyping_infos{$dna}; | |
369 } | |
370 } | |
371 my $snp_type = "[$ref_allele/$alt_allele]"; | |
372 $snps_of_gene2{$chromosome}{$chromosome_position} = $snp_type; | |
373 #print $HAPMAP "$chromosome:$chromosome_position\t$snp_type\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$nb_readable_ind\t$genotyping\n"; | |
374 print $HAPMAP "$chromosome:$chromosome_position\t$ref_allele/$alt_allele\t$chromosome\t$chromosome_position\t$gene:$geneposition\t$is_in_exon\t$is_synonyme\t$modif_codon\t$modif_aa\t$snp_frequency%\t$ratio_missing_data%\t$genotyping\n"; | |
375 } | |
376 } | |
377 } | |
378 close($VCF); | |
379 close($HAPMAP); | |
380 | |
381 | |
382 if (!$reference){exit;} | |
383 | |
384 ################################################################# | |
385 # generate flanking sequences for Illumina VeraCode technology | |
386 ################################################################# | |
387 open(my $FLANKING,">$out.flanking.txt"); | |
388 foreach my $seq(keys(%ref_sequences)) | |
389 { | |
390 if ($snps_of_gene2{$seq}) | |
391 { | |
392 my $refhash = $snps_of_gene2{$seq}; | |
393 my %hashreal = %$refhash; | |
394 | |
395 # create consensus | |
396 my $refseq = $ref_sequences{$seq}; | |
397 my $consensus = ""; | |
398 my $previous = 0; | |
399 foreach my $pos(sort {$a<=>$b} keys(%hashreal)) | |
400 { | |
401 my $length = $pos - $previous - 1; | |
402 $consensus .= substr($refseq,$previous,$length); | |
403 my $iupac_code = $IUPAC{$snps_of_gene2{$seq}{$pos}}; | |
404 $consensus .= $iupac_code; | |
405 $previous = $pos; | |
406 } | |
407 my $length = length($refseq) - $previous; | |
408 $consensus .= substr($refseq,$previous,$length); | |
409 | |
410 foreach my $pos(sort {$a<=>$b} keys(%hashreal)) | |
411 { | |
412 my $snp_name = "$seq-$pos"; | |
413 my $flanking_length = 60; | |
414 my $length = $flanking_length; | |
415 my $start = $pos - $flanking_length - 1; | |
416 if ($pos <= $flanking_length) | |
417 { | |
418 $length = $pos - 1; | |
419 $start = 0; | |
420 } | |
421 my $sequence = substr($consensus,$start,$length); | |
422 | |
423 $sequence .= $snps_of_gene2{$seq}{$pos}; | |
424 $sequence .= substr($consensus,$pos,$flanking_length); | |
425 print $FLANKING "$snp_name,$sequence,0,0,0,Project_name,0,diploid,Other,Forward\n"; | |
426 } | |
427 } | |
428 } | |
429 | |
430 close($FLANKING); | |
431 | |
432 if (!$gff){exit;} | |
433 | |
434 my @individuals_list = split(/\t/,$indiv_list); | |
435 if ((scalar @individuals_list * scalar keys(%snps_of_gene)) > 800000) | |
436 { | |
437 print "Sorry, too many sequences to manage ...\n"; | |
438 exit; | |
439 } | |
440 | |
441 open(my $ALIGN_EGGLIB,">$out.gene_alignment.fas"); | |
442 my %alignments_ind; | |
443 foreach my $seq(keys(%snps_of_gene)) | |
444 { | |
445 if ($snps_of_gene{$seq}) | |
446 { | |
447 my $refhash = $snps_of_gene{$seq}; | |
448 my %hashreal = %$refhash; | |
449 | |
450 # get flanking sequences | |
451 my %flanking5; | |
452 my $start = $ann{$seq}{"start"}; | |
453 my $stop = $ann{$seq}{"stop"}; | |
454 my $strand = $ann{$seq}{"strand"}; | |
455 my $genelength = $stop - $start+1; | |
456 my $chr = $chr_of_gene{$seq}; | |
457 my $refseq = substr($ref_sequences{$chr},$start-1,$genelength); | |
458 if ($strand eq '-') | |
459 { | |
460 $refseq =~ tr /atcgATCG/tagcTAGC/; $refseq = reverse($refseq); | |
461 } | |
462 #print "$seq $chr $start $stop $refseq \n"; | |
463 my $previous = 0; | |
464 foreach my $pos(sort {$a<=>$b} keys(%hashreal)) | |
465 { | |
466 my $length = $pos - $previous - 1; | |
467 $flanking5{$pos} = substr($refseq,$previous,$length); | |
468 $previous = $pos; | |
469 } | |
470 my $length = length($refseq) - $previous; | |
471 my $flanking3 = substr($refseq,$previous,$length); | |
472 foreach my $ind(@individuals_list) | |
473 { | |
474 my $nb_missing_data_for_this_individual = 0; | |
475 if ($ind) | |
476 { | |
477 my $alignment_for_ind = ""; | |
478 my $seq_without_underscore = $seq; | |
479 $seq_without_underscore =~s/_//g; | |
480 $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_1\n"; | |
481 foreach my $pos(sort {$a<=>$b} keys(%hashreal)) | |
482 { | |
483 $alignment_for_ind .= $flanking5{$pos}; | |
484 my $geno = $snps_of_gene{$seq}{$pos}{$ind}; | |
485 $geno =~s/N/?/g; | |
486 if ($geno =~/\?/){$nb_missing_data_for_this_individual++;} | |
487 my @alleles = split("",$geno); | |
488 $alignment_for_ind .= $alleles[0]; | |
489 if ($alleles[0] eq $alleles[1]) | |
490 { | |
491 $alignments_ind{$ind} .= $alleles[1]; | |
492 } | |
493 else | |
494 { | |
495 my $snp_type = "[" . $alleles[0] . "/" . $alleles[1] . "]"; | |
496 $alignments_ind{$ind} .= $IUPAC{$snp_type}; | |
497 } | |
498 } | |
499 $alignment_for_ind .= $flanking3; | |
500 $alignment_for_ind .= "\n"; | |
501 | |
502 | |
503 $alignment_for_ind .= ">$seq_without_underscore" . "_$ind" . "_2\n"; | |
504 foreach my $pos(sort {$a<=>$b} keys(%hashreal)) | |
505 { | |
506 $alignment_for_ind .= $flanking5{$pos}; | |
507 my $geno = $snps_of_gene{$seq}{$pos}{$ind}; | |
508 $geno =~s/N/?/g; | |
509 my @alleles = split("",$geno); | |
510 $alignment_for_ind .= $alleles[1]; | |
511 } | |
512 $alignment_for_ind .= $flanking3; | |
513 $alignment_for_ind .= "\n"; | |
514 if (keys(%hashreal) != $nb_missing_data_for_this_individual) | |
515 { | |
516 print $ALIGN_EGGLIB $alignment_for_ind; | |
517 } | |
518 } | |
519 } | |
520 } | |
521 } | |
522 close($ALIGN_EGGLIB); | |
523 | |
524 | |
525 |