comparison DC_Genotyper.pl @ 11:845a87ad254a draft

re-upload, accidentally removed some files
author geert-vandeweyer
date Sat, 27 Sep 2014 05:38:17 -0400
parents
children 8cc26598eeac
comparison
equal deleted inserted replaced
10:a7e7f9b8f538 11:845a87ad254a
1 #!/usr/bin/perl
2 #############################
3 ## DEEP COVERAGE GENOTYPER ##
4 #############################
5 # version : 0.0.1
6 # Principle:
7 # 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!
8 # 2. remove known dbsnp positions (bcf file)
9 # 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.
10 # 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)
11 # 5. For abberant positions, check each alternate allele to see if it passes the background signal.
12 # 6. Generate VCF file.
13
14
15 ##################
16 ## LOAD MODULES ##
17 ##################
18 use threads;
19 use threads::shared;
20 use Thread::Queue;
21 use Getopt::Std;
22
23 ####################
24 ## get paramaters ##
25 ####################
26 # t: target file
27 # b: bam file
28 # R: 2bit version of reference fasta.
29 # p: number of threads.
30 # s: dbsnp file
31 # m: minimal coverage (defaults 400x)
32 # P: ploidy
33 # a: outfile for allele distributions
34 # v: vcf file output.
35 getopts('t:b:R:p:s:m:P:v:a:', \%opts) ;
36
37 ## variables
38 my $twobit :shared;
39 my $igvgenome :shared;
40 if (!defined($opts{'R'})) {
41 die("Reference Genomes not specified\n");
42 }
43 my @refgenomes = split(",",$opts{'r'});
44 if (!-e $refgenomes[0]) {
45 die("'$refgenomes[0]' is not a valid file path.");
46 }
47 else {
48 $twobit = $refgenomes[0];
49 }
50 if (!-e $refgenomes[1]) {
51 die("'$refgenomes[1]' is not a valid file path.");
52 }
53 else {
54 $igvgenome = $refgenomes[1];
55 }
56
57
58 my $mincov :shared;
59 $mincov = 320;
60 if (defined($opts{'m'})) {
61 $mincov = $opts{'m'};
62 }
63
64 my $ploidy :shared;
65 if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) {
66 $ploidy = $opts{'P'};
67 }
68 else {
69 die("Ploidy (-P) was not specified or not an integer\n");
70 }
71
72
73 if (defined($opts{'v'})) {
74 $outfile = $opts{'v'};
75 }
76 else {
77 die("No output vcf-file specified.\n");
78 }
79 if (!defined($opts{'a'})) {
80 die("No output file specified for distribution details\n");
81 }
82 ## create working dir.
83 my $rand = int(rand(10000));
84 while (-d "/tmp/DC_Genotyper_$rand") {
85 $rand = int(rand(10000));
86 }
87 my $wd :shared;
88 $wd = "/tmp/DC_Genotyper_$rand";
89 system("mkdir '$wd'");
90
91
92 my $snpfile :shared;
93 my $hassnp :shared;
94 $hassnp = 'NoDbSNP';
95 $snpfile = '';
96 if (defined($opts{'s'})) {
97 $snpfile = $opts{'s'};
98 if (!-e $snpfile) {
99 die("'$snpfile' is not a valid file path.");
100 }
101
102 my $mime = `file $snpfile`;
103 if ($mime !~ m/compressed/) {
104 print "$snpfile is not in compressed format. compressing & indexing the file now.\n";
105 #print "... this takes a while\n";
106 system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz");
107 system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");
108 $snpfile = "$wd/dbSNP.vcf.bgz";
109 }
110 elsif (!-e "$snpfile.tbi") {
111 print "tabix index file is missing for '$snpfile'. creating now.\n";
112 ## check if I can write it out for future use
113 $snpfile =~ m/(.*)([^\/]+)$/;
114 my $d = $1;
115 if (-w $d) {
116 open OUT, ">$d/lock";
117 flock(OUT,2);
118 system("cd $d && tabix -p vcf $snpfile");
119 close OUT;
120 system("rm $d/lock");
121 }
122 else {
123 system("cp $snpfile /$wd/dbSNP.vcf.bgz");
124 system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");
125 $snpfile = "$wd/dbSNP.vcf.bgz";
126 }
127 }
128 $hassnp = 'WithDbSNP';
129 }
130
131
132 ## 1. Get FASTA and prepare output hashes:
133 my $targets_one = Thread::Queue->new();
134 my $targets_two = Thread::Queue->new();
135 my $targets_three = Thread::Queue->new();
136 open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading");
137 if (-d "$wd/Fasta/") {
138 system("rm $wd/Fasta/*");
139 }
140 else {
141 system("mkdir $wd/Fasta");
142 }
143 ## create the threads.
144 for (my $i = 1; $i<= $opts{'p'}; $i++) {
145 ${'thr'.$i} = threads->create('FetchFasta');
146 }
147
148 ## enqueue the targets.
149 my %thash;
150 while (<IN>) {
151 chomp;
152 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_);
153 $targets_one->enqueue($_);
154 $targets_two->enqueue($_);
155 $targets_three->enqueue($_);
156 $thash{$chr}{$start} = $stop;
157 }
158 close IN;
159
160 ## end the threads.
161 for (my $i = 1; $i <= $opts{'p'}; $i++) {
162 $targets_one->enqueue(undef);
163 }
164
165 for (my $i = 1; $i <= $opts{'p'}; $i++) {
166 ${'thr'.$i}->join();
167 }
168
169 ## load dbSNP inside target regions into shared structure.
170 ##########################################################
171 my %dbsnp :shared;
172 if ($snpfile ne '') {
173 my $bcf = `which bcftools`;
174 chomp($bcf);
175 if ($bcf ne '') {
176 my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt";
177 system("$command");
178 open IN, "$wd/dbsnp.txt";
179 while (<IN>) {
180 chomp;
181 my @p = split(/\t/,$_);
182 $dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4];
183 }
184 close IN;
185 }
186 else {
187 print "WARNING: BCFtools is not in the path. Skipping snp handling.\n";
188 $snpfile = '';
189 system("touch $wd/dbsnp.txt");
190 }
191 }
192 else {
193 system("touch $wd/dbsnp.txt");
194 }
195
196 ## now process the bam file.
197 mkdir "$wd/WIGS/";
198 my $bam :shared;
199 $bam = $opts{'b'};
200 my $bai = $bam.".bai";
201 if (!-e $bai) {
202 print "BAI ($bai) missing for $bam : creating\n";
203 system("samtools index $bam");
204 }
205
206 for (my $i = 1; $i <= $opts{'p'}; $i++) {
207 ${'thr'.$i} = threads->create('CountAlleles');
208 }
209 ## end the threads.
210 for (my $i = 1; $i <= $opts{'p'}; $i++) {
211 $targets_two->enqueue(undef);
212 }
213
214 for (my $i = 1; $i <= $opts{'p'}; $i++) {
215 ${'thr'.$i}->join();
216 }
217
218 ## generate the distributions.
219 ##############################
220 my $alleles = Thread::Queue->new();
221 my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4);
222 foreach(keys(%all)) {
223 $alleles->enqueue($_);
224 my $a = $_;
225 foreach(keys(%all)) {
226 if ($_ eq $a) {
227 next;
228 }
229 $alleles->enqueue($a.'-'.$_);
230 }
231 }
232 for (my $i = 1; $i <= $opts{'p'}; $i++) {
233 ${'thr'.$i} = threads->create('GetDistribution');
234 }
235 ## end the threads.
236 for (my $i = 1; $i <= $opts{'p'}; $i++) {
237 $alleles->enqueue(undef);
238 }
239
240 for (my $i = 1; $i <= $opts{'p'}; $i++) {
241 ${'thr'.$i}->join();
242 }
243
244 ## group distributions into one file
245 ####################################
246 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5);
247 open OUT, ">".$opts{'a'};
248 print OUT "allele\tavg\tsd\n";
249 foreach(keys(%map)) {
250 my $r = $_;
251 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt";
252 open IN, "$f";
253 my $a = <IN>;
254 chomp($a);
255 #$dists{$r}{'avg'} = $a;
256 my $s = <IN>;
257 chomp($s);
258 #$dists{$r}{'sd'} = $s;
259 close IN;
260 print OUT "$r\t$a\t$s\n";
261 foreach(keys(%map)) {
262 if ($_ eq $r) {
263 next;
264 }
265 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt";
266 open IN, "$f";
267 my $a = <IN>;
268 chomp($a);
269 my $s = <IN>;
270 chomp($s);
271 close IN;
272 print OUT "$r-$_\t$a\t$s\n";
273 }
274 }
275 close OUT;
276
277 ## CALL SNPs
278 ############
279 # create the R script.
280 open R, ">$wd/CallSNPs.R";
281 print R "args <- commandArgs(trailingOnly = TRUE)\n";
282 print R "counts <- read.table(file=args[1],header=FALSE, as.is=TRUE)\n";
283 print R "ploidy <- as.integer(args[3])\n";
284 print R "chr <- args[2]\n";
285 print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n";
286 print R "colnames(snps) <- c('chr','pos','ref','alt','id')\n";
287 print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n";
288 print R "dists <- read.table(file='$wd/allelic_distributions.txt',header=TRUE,as.is=TRUE)\n";
289 print R 'rownames(dists) = dists$allele'."\n";
290 print R 'dists <- dists[,-1]'."\n";
291 print R "vcf <- c()\n";
292 print R "lower <- c()\n";
293 print R "higher <- c()\n";
294 print R "for (i in 1:(ploidy)) {\n";
295 print R " lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n";
296 print R " higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n";
297 print R "}\n";
298 print R "for (i in 1:nrow(counts)) {\n";
299 print R " if (counts[i,'TotalDepth'] == 0) next\n";
300 print R " # significantly different from reference?\n";
301 print R " z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) / dists[counts[i,'ref'],'sd']\n";
302 print R " if (abs(z) > 3) {\n";
303 print R " # test all alterate alleles to see which one is significant.\n";
304 print R " for (j in c('A','C','G','T')) {\n";
305 print R " if (j == counts[i,'ref']) next\n";
306 print R " z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) / dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n";
307 print R " if (abs(z) > 3){\n";
308 print R " filter <- 'PASS'\n";
309 print R " phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n";
310 print R " if (phred > 9999) phred <- 9999\n";
311 print R " frac <- counts[i,j]/counts[i,'TotalDepth']\n";
312 print R " for (k in 1:ploidy) {\n";
313 print R " if (frac >= lower[k] && frac < higher[k]) {\n";
314 print R " sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n";
315 print R " af <- k/ploidy\n";
316 print R " break\n";
317 print R " }\n";
318 print R " }\n";
319 print R " if (frac < lower[1]) {\n";
320 print R " sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n";
321 print R " af <- 1/ploidy\n";
322 print R " filter <- 'LowFraction'\n";
323 print R " }\n";
324 print R " if (counts[i,'TotalDepth'] < $mincov) {\n";
325 print R " filter <- 'LowCoverage'\n";
326 print R " }\n";
327 print R " info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n";
328 print R " snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n";
329 print R " id <- '.'\n";
330 print R " if (length(snpids) > 0) id <- snps[snpids[1],'id']\n";
331 print R " vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n";
332 print R " }\n";
333 print R " }\n";
334 print R " }\n";
335 print R "}\n";
336 print R "if (length(vcf) > 0) {\n";
337 print R " write(file=args[4],paste(vcf,sep='\\n'))\n";
338 print R "}\n";
339 close R;
340 system("mkdir $wd/VCF/");
341 for (my $i = 1; $i <= $opts{'p'}; $i++) {
342 ${'thr'.$i} = threads->create('CallSNPs');
343 }
344 ## end the threads.
345 for (my $i = 1; $i <= $opts{'p'}; $i++) {
346 $targets_three->enqueue(undef);
347 }
348
349 for (my $i = 1; $i <= $opts{'p'}; $i++) {
350 ${'thr'.$i}->join();
351 }
352
353 ## BUILD FINAL VCF
354 open OUT, ">$outfile";
355 print OUT "##fileformat=VCFv4.1\n";
356 print OUT "##source=High_Ploidy_Genotyper_v.0.1\n";
357 print OUT "##genome_reference=$twobit\n";
358 if ($snpfile ne '') {
359 print OUT "##SNP_file=$snpfile\n";
360 }
361 foreach(keys(%thash)) {
362 print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n";
363 }
364 print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n";
365 print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n";
366 print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n";
367 print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n";
368 print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n";
369 print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
370 print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n";
371 print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n";
372 close OUT;
373 @i = ( 1 .. 22,'X','Y','M' );
374 foreach(@i) {
375 my $chr = "chr$_";
376 foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) {
377 my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf";
378 if (-e $v) {
379 system("cat '$v' >> '$outfile'");
380 }
381 }
382 }
383
384 ## clean up
385 system("rm -Rf '$wd'");
386
387 sub FetchFasta {
388 while(defined(my $line = $targets_one->dequeue())) {
389 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
390 # 2bit is zero based, non-including => decrease start by one
391 $startposition = $start - 1;
392 my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta";
393 system($command);
394 }
395 }
396
397 sub CountAlleles {
398 # local version of hashes
399 my $snp = \%dbsnp;
400 my %counts;
401 $counts{'A'} = '';
402 $counts{'C'} = '';
403 $counts{'G'} = '';
404 $counts{'T'} = '';
405 my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4);
406 my %options;
407 foreach(keys(%map)) {
408 my $r = $_;
409 foreach(keys(%map)) {
410 if ($_ eq $r) {
411 next;
412 }
413 $options{$r.'-'.$_} = '';
414 }
415 }
416 while (defined(my $line = $targets_two->dequeue())) {
417 $out = '';
418 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
419 ## get reference alleles
420 my %ref_alleles;
421 open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta";
422 my $head = <FASTA>;
423 my $seq = '';
424 while (<FASTA>) {
425 chomp;
426 $seq .= $_;
427 }
428 close FASTA;
429 # this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration.
430 for (my $pos = 0; $pos < length($seq); $pos++) {
431 $ref_alleles{($pos+$start)} = substr($seq,$pos,1);
432 }
433 ## get counts.
434 my $target = "$chr:$start-$stop";
435 my $command = "igvtools count -w 1 --bases --query '$target' '$bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1";
436 system($command);
437 open WIG, "$wd/WIGS/$chr-$start-$stop.wig";
438 my $h = <WIG>;
439 $h = <WIG>;
440 $h = <WIG>;
441 my $target_counts = '';
442 while (<WIG>) {
443 chomp;
444 #my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_);
445 my @p = split(/\t/,$_);
446 my $s = $p[1] + $p[2] + $p[3] + $p[4];
447 $target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n";
448 ## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty).
449 if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) {
450 ## for model of 'non-reference'
451 my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s;
452 $counts{$ref_alleles{$p[0]}} .= $frac.',';
453 $out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n";
454 ## for each of the options background models
455 foreach(keys(%map)) {
456 if ($_ eq $ref_alleles{$p[0]}) {
457 next;
458 }
459 $options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .',';
460 }
461
462 }
463 }
464 close WIG;
465 open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt";
466 flock(OUT, 2);
467 print OUT $out;
468 close OUT;
469 open OUT, ">$wd/WIGS/$chr.$start-$stop.txt";
470 print OUT $target_counts;
471 close OUT;
472
473 }
474 foreach(keys(%counts)) {
475 open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";
476 flock(OUT,2);
477 print OUT $counts{$_};
478 close OUT;
479 }
480 foreach(keys(%options)) {
481 open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";
482 flock(OUT,2);
483 print OUT $options{$_};
484 close OUT;
485 }
486 }
487
488 sub GetDistribution {
489 while (defined(my $allele = $alleles->dequeue())) {
490 system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'");
491 open OUT, ">$wd/GetDistribution.$allele.R";
492 print OUT "sample <- '$bam'\n";
493 print OUT "nt <- '$allele'\n";
494 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n";
495 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n";
496 print OUT "nr <- length(data)\n";
497 print OUT "avg <- mean(data)\n";
498 print OUT "sdd <- sd(data)\n";
499 #print OUT "if (avg > 0.5) {\n";
500 #print OUT " x <- seq(0.8,1,length=1000)\n";
501 #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n";
502 #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n";
503 #print OUT " abline(v=(avg-3*sdd),col='red')\n";
504 #print OUT " text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n";
505 #print OUT "} else {\n";
506 #print OUT " x <- seq(0,0.3,length=1000)\n";
507 #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n";
508 #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n";
509 #print OUT " abline(v=(avg+3*sdd),col='red')\n";
510 #print OUT " text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n";
511 #print OUT "}\n";
512 #print OUT "dev.off()\n";
513 print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
514 close OUT;
515 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");
516 }
517 }
518
519
520 sub CallSNPs {
521 while (defined(my $line = $targets_three->dequeue())) {
522 # split.
523 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
524 my $file = "$wd/WIGS/$chr.$start-$stop.txt";
525 my $ofile = "$wd/VCF/$chr.$start-$stop.vcf";
526 system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'");
527 }
528
529 }
530