Mercurial > repos > geert-vandeweyer > dc_genotyper
annotate DC_Genotyper.pl @ 21:36e1ba3b4d1b draft
updated perl-threading modules, due to cpan update
author | geert-vandeweyer |
---|---|
date | Thu, 07 Jan 2016 05:08:45 -0500 |
parents | 8262299f8f3c |
children |
rev | line source |
---|---|
11 | 1 #!/usr/bin/perl |
2 ############################# | |
3 ## DEEP COVERAGE GENOTYPER ## | |
4 ############################# | |
5 # version : 0.0.1 | |
6 # Principle: | |
7 # 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! | |
8 # 2. remove known dbsnp positions (bcf file) | |
9 # 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. | |
10 # 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) | |
11 # 5. For abberant positions, check each alternate allele to see if it passes the background signal. | |
12 # 6. Generate VCF file. | |
13 | |
14 | |
15 ################## | |
16 ## LOAD MODULES ## | |
17 ################## | |
18 use threads; | |
19 use threads::shared; | |
20 use Thread::Queue; | |
21 use Getopt::Std; | |
22 | |
23 #################### | |
24 ## get paramaters ## | |
25 #################### | |
26 # t: target file | |
27 # b: bam file | |
12 | 28 # R: reference genome files for twobit and IGV. |
11 | 29 # p: number of threads. |
30 # s: dbsnp file | |
31 # m: minimal coverage (defaults 400x) | |
32 # P: ploidy | |
33 # a: outfile for allele distributions | |
34 # v: vcf file output. | |
17 | 35 # d: distribution plots pdf file |
36 getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ; | |
11 | 37 |
38 ## variables | |
39 my $twobit :shared; | |
40 my $igvgenome :shared; | |
41 if (!defined($opts{'R'})) { | |
42 die("Reference Genomes not specified\n"); | |
43 } | |
12 | 44 my @refgenomes = split(",",$opts{'R'}); |
11 | 45 if (!-e $refgenomes[0]) { |
46 die("'$refgenomes[0]' is not a valid file path."); | |
47 } | |
48 else { | |
49 $twobit = $refgenomes[0]; | |
50 } | |
51 if (!-e $refgenomes[1]) { | |
52 die("'$refgenomes[1]' is not a valid file path."); | |
53 } | |
54 else { | |
55 $igvgenome = $refgenomes[1]; | |
56 } | |
57 | |
58 | |
59 my $mincov :shared; | |
60 $mincov = 320; | |
61 if (defined($opts{'m'})) { | |
62 $mincov = $opts{'m'}; | |
63 } | |
64 | |
65 my $ploidy :shared; | |
66 if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) { | |
67 $ploidy = $opts{'P'}; | |
68 } | |
69 else { | |
70 die("Ploidy (-P) was not specified or not an integer\n"); | |
71 } | |
72 | |
73 | |
74 if (defined($opts{'v'})) { | |
75 $outfile = $opts{'v'}; | |
76 } | |
77 else { | |
78 die("No output vcf-file specified.\n"); | |
79 } | |
80 if (!defined($opts{'a'})) { | |
81 die("No output file specified for distribution details\n"); | |
82 } | |
83 ## create working dir. | |
84 my $rand = int(rand(10000)); | |
85 while (-d "/tmp/DC_Genotyper_$rand") { | |
86 $rand = int(rand(10000)); | |
87 } | |
88 my $wd :shared; | |
89 $wd = "/tmp/DC_Genotyper_$rand"; | |
90 system("mkdir '$wd'"); | |
91 | |
92 | |
93 my $snpfile :shared; | |
94 my $hassnp :shared; | |
95 $hassnp = 'NoDbSNP'; | |
96 $snpfile = ''; | |
97 if (defined($opts{'s'})) { | |
98 $snpfile = $opts{'s'}; | |
99 if (!-e $snpfile) { | |
100 die("'$snpfile' is not a valid file path."); | |
101 } | |
102 | |
103 my $mime = `file $snpfile`; | |
104 if ($mime !~ m/compressed/) { | |
105 print "$snpfile is not in compressed format. compressing & indexing the file now.\n"; | |
106 #print "... this takes a while\n"; | |
107 system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz"); | |
108 system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); | |
109 $snpfile = "$wd/dbSNP.vcf.bgz"; | |
110 } | |
111 elsif (!-e "$snpfile.tbi") { | |
112 print "tabix index file is missing for '$snpfile'. creating now.\n"; | |
113 ## check if I can write it out for future use | |
114 $snpfile =~ m/(.*)([^\/]+)$/; | |
115 my $d = $1; | |
116 if (-w $d) { | |
117 open OUT, ">$d/lock"; | |
118 flock(OUT,2); | |
119 system("cd $d && tabix -p vcf $snpfile"); | |
120 close OUT; | |
121 system("rm $d/lock"); | |
122 } | |
123 else { | |
124 system("cp $snpfile /$wd/dbSNP.vcf.bgz"); | |
125 system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); | |
126 $snpfile = "$wd/dbSNP.vcf.bgz"; | |
127 } | |
128 } | |
129 $hassnp = 'WithDbSNP'; | |
130 } | |
131 | |
132 | |
133 ## 1. Get FASTA and prepare output hashes: | |
134 my $targets_one = Thread::Queue->new(); | |
135 my $targets_two = Thread::Queue->new(); | |
136 my $targets_three = Thread::Queue->new(); | |
137 open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading"); | |
138 if (-d "$wd/Fasta/") { | |
139 system("rm $wd/Fasta/*"); | |
140 } | |
141 else { | |
142 system("mkdir $wd/Fasta"); | |
143 } | |
144 ## create the threads. | |
145 for (my $i = 1; $i<= $opts{'p'}; $i++) { | |
146 ${'thr'.$i} = threads->create('FetchFasta'); | |
147 } | |
148 | |
149 ## enqueue the targets. | |
150 my %thash; | |
151 while (<IN>) { | |
152 chomp; | |
153 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_); | |
154 $targets_one->enqueue($_); | |
155 $targets_two->enqueue($_); | |
156 $targets_three->enqueue($_); | |
157 $thash{$chr}{$start} = $stop; | |
158 } | |
159 close IN; | |
160 | |
161 ## end the threads. | |
162 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
163 $targets_one->enqueue(undef); | |
164 } | |
165 | |
166 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
167 ${'thr'.$i}->join(); | |
168 } | |
169 | |
170 ## load dbSNP inside target regions into shared structure. | |
171 ########################################################## | |
172 my %dbsnp :shared; | |
173 if ($snpfile ne '') { | |
17 | 174 ## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package. |
175 # as a work-around, use tabix, but this is slower. | |
16 | 176 #my $bcf = `which bcftools`; |
177 #chomp($bcf); | |
178 #if ($bcf ne '') { | |
179 # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; | |
180 # system("$command"); | |
181 # open IN, "$wd/dbsnp.txt"; | |
182 # while (<IN>) { | |
183 # chomp; | |
184 # my @p = split(/\t/,$_); | |
185 # $dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4]; | |
186 # } | |
187 # close IN; | |
188 #} | |
189 #else { | |
190 # print "WARNING: BCFtools is not in the path. Skipping snp handling.\n"; | |
191 # $snpfile = ''; | |
192 # system("touch $wd/dbsnp.txt"); | |
193 #} | |
194 system("tabix $snpfile -B $opts{'t'} | cut -f 1-5 > $wd/dbsnp.txt"); | |
195 my $lc = `cat $wd/dbsnp.txt | wc -l`; | |
196 chomp($lc); | |
197 if ($lc eq '0') { | |
198 open SNP, ">$wd/dbsnp.txt"; | |
199 ## dummy line on chr zero | |
200 print SNP "chr0\t1\t.\tA\tT\n"; | |
201 close SNP; | |
17 | 202 } |
11 | 203 } |
204 else { | |
16 | 205 open SNP, ">$wd/dbsnp.txt"; |
17 | 206 ## dummy line on chr zero to prevent R issues on empty file. |
16 | 207 print SNP "chr0\t1\t.\tA\tT\n"; |
208 close SNP; | |
11 | 209 } |
20
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
210 open IN, "$wd/dbsnp.txt"; |
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
211 while (<IN>) { |
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
212 chomp; |
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
213 my @p = split(/\t/,$_); |
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
214 $dbsnp{$p[0].'-'.$p[1]} = $p[3].'-'.$p[4].'-'.$p[2]; |
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
215 } |
8262299f8f3c
Fixed loading of dbsnp after bcftools-tabix switch.
geert-vandeweyer
parents:
19
diff
changeset
|
216 close IN; |
11 | 217 |
218 ## now process the bam file. | |
219 mkdir "$wd/WIGS/"; | |
220 my $bam :shared; | |
221 $bam = $opts{'b'}; | |
17 | 222 # igvtools cannot handle the .dat extension, so make symlink |
223 system("ln -s '$bam' '$wd/input.bam'"); | |
224 system("cd $wd && samtools index input.bam"); | |
11 | 225 |
226 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
227 ${'thr'.$i} = threads->create('CountAlleles'); | |
228 } | |
229 ## end the threads. | |
230 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
231 $targets_two->enqueue(undef); | |
232 } | |
233 | |
234 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
235 ${'thr'.$i}->join(); | |
236 } | |
237 | |
238 ## generate the distributions. | |
239 ############################## | |
240 my $alleles = Thread::Queue->new(); | |
241 my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4); | |
242 foreach(keys(%all)) { | |
243 $alleles->enqueue($_); | |
244 my $a = $_; | |
245 foreach(keys(%all)) { | |
246 if ($_ eq $a) { | |
247 next; | |
248 } | |
249 $alleles->enqueue($a.'-'.$_); | |
250 } | |
251 } | |
252 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
253 ${'thr'.$i} = threads->create('GetDistribution'); | |
254 } | |
255 ## end the threads. | |
256 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
257 $alleles->enqueue(undef); | |
258 } | |
259 | |
260 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
261 ${'thr'.$i}->join(); | |
262 } | |
263 | |
264 ## group distributions into one file | |
265 #################################### | |
266 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); | |
267 open OUT, ">".$opts{'a'}; | |
17 | 268 print OUT "allele\tavg\tsd\tN\n"; |
11 | 269 foreach(keys(%map)) { |
270 my $r = $_; | |
271 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; | |
272 open IN, "$f"; | |
273 my $a = <IN>; | |
274 chomp($a); | |
275 my $s = <IN>; | |
276 chomp($s); | |
17 | 277 my $n = <IN>; |
278 chomp($n); | |
11 | 279 close IN; |
17 | 280 print OUT "$r\t$a\t$s\t$n\n"; |
11 | 281 foreach(keys(%map)) { |
282 if ($_ eq $r) { | |
283 next; | |
284 } | |
285 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; | |
286 open IN, "$f"; | |
287 my $a = <IN>; | |
288 chomp($a); | |
289 my $s = <IN>; | |
290 chomp($s); | |
17 | 291 my $n = <IN>; |
292 chomp($n); | |
11 | 293 close IN; |
17 | 294 print OUT "$r-$_\t$a\t$s\t$n\n"; |
11 | 295 } |
296 } | |
297 close OUT; | |
298 | |
17 | 299 ## make pdf with distribution plots |
300 ################################### | |
301 open OUT, ">$wd/MakePlots.R"; | |
302 print OUT "\n"; | |
303 print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n"; | |
304 print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n"; | |
305 print OUT "par(mfrow=c(2, 2))\n"; | |
306 print OUT "for (i in 1:nrow(dists)) {\n"; | |
307 print OUT " if (dists[i,'avg'] > 0.5) {\n"; | |
19
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
308 print OUT " x <- seq(0.85,1,length=1000)\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
309 print OUT " y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd'])\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
310 print OUT " plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1)\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
311 print OUT " abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red')\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
312 print OUT " text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1))\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
313 print OUT " } else {\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
314 print OUT " x <- seq(0,0.15,length=1000)\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
315 print OUT " y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd'])\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
316 print OUT " plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1)\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
317 print OUT " abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red')\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
318 print OUT " text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1))\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
319 print OUT " }\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
320 print OUT "}\n"; |
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
321 print OUT "dev.off()\n"; |
17 | 322 close OUT; |
323 system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1"); | |
324 | |
11 | 325 ## CALL SNPs |
326 ############ | |
327 # create the R script. | |
328 open R, ">$wd/CallSNPs.R"; | |
16 | 329 print R "\n"; |
11 | 330 print R "args <- commandArgs(trailingOnly = TRUE)\n"; |
16 | 331 print R "counts <- read.table(file = args[1],header = FALSE, as.is = TRUE)\n"; |
11 | 332 print R "ploidy <- as.integer(args[3])\n"; |
333 print R "chr <- args[2]\n"; | |
334 print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n"; | |
16 | 335 print R "colnames(snps) <- c('chr','pos','id','ref','alt')\n"; |
11 | 336 print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n"; |
16 | 337 print R "dists <- read.table(file='".$opts{'a'}."',header=TRUE,as.is=TRUE)\n"; |
11 | 338 print R 'rownames(dists) = dists$allele'."\n"; |
339 print R 'dists <- dists[,-1]'."\n"; | |
340 print R "vcf <- c()\n"; | |
341 print R "lower <- c()\n"; | |
342 print R "higher <- c()\n"; | |
343 print R "for (i in 1:(ploidy)) {\n"; | |
344 print R " lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n"; | |
345 print R " higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n"; | |
346 print R "}\n"; | |
347 print R "for (i in 1:nrow(counts)) {\n"; | |
348 print R " if (counts[i,'TotalDepth'] == 0) next\n"; | |
349 print R " # significantly different from reference?\n"; | |
350 print R " z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) / dists[counts[i,'ref'],'sd']\n"; | |
351 print R " if (abs(z) > 3) {\n"; | |
352 print R " # test all alterate alleles to see which one is significant.\n"; | |
353 print R " for (j in c('A','C','G','T')) {\n"; | |
354 print R " if (j == counts[i,'ref']) next\n"; | |
355 print R " z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) / dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n"; | |
356 print R " if (abs(z) > 3){\n"; | |
357 print R " filter <- 'PASS'\n"; | |
358 print R " phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n"; | |
359 print R " if (phred > 9999) phred <- 9999\n"; | |
360 print R " frac <- counts[i,j]/counts[i,'TotalDepth']\n"; | |
361 print R " for (k in 1:ploidy) {\n"; | |
362 print R " if (frac >= lower[k] && frac < higher[k]) {\n"; | |
363 print R " sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; | |
364 print R " af <- k/ploidy\n"; | |
365 print R " break\n"; | |
366 print R " }\n"; | |
367 print R " }\n"; | |
368 print R " if (frac < lower[1]) {\n"; | |
369 print R " sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; | |
370 print R " af <- 1/ploidy\n"; | |
371 print R " filter <- 'LowFraction'\n"; | |
372 print R " }\n"; | |
373 print R " if (counts[i,'TotalDepth'] < $mincov) {\n"; | |
374 print R " filter <- 'LowCoverage'\n"; | |
375 print R " }\n"; | |
376 print R " info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n"; | |
377 print R " snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n"; | |
378 print R " id <- '.'\n"; | |
379 print R " if (length(snpids) > 0) id <- snps[snpids[1],'id']\n"; | |
380 print R " vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n"; | |
381 print R " }\n"; | |
382 print R " }\n"; | |
383 print R " }\n"; | |
384 print R "}\n"; | |
385 print R "if (length(vcf) > 0) {\n"; | |
386 print R " write(file=args[4],paste(vcf,sep='\\n'))\n"; | |
387 print R "}\n"; | |
388 close R; | |
389 system("mkdir $wd/VCF/"); | |
390 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
391 ${'thr'.$i} = threads->create('CallSNPs'); | |
392 } | |
393 ## end the threads. | |
394 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
395 $targets_three->enqueue(undef); | |
396 } | |
397 | |
398 for (my $i = 1; $i <= $opts{'p'}; $i++) { | |
399 ${'thr'.$i}->join(); | |
400 } | |
401 | |
402 ## BUILD FINAL VCF | |
403 open OUT, ">$outfile"; | |
404 print OUT "##fileformat=VCFv4.1\n"; | |
405 print OUT "##source=High_Ploidy_Genotyper_v.0.1\n"; | |
406 print OUT "##genome_reference=$twobit\n"; | |
407 if ($snpfile ne '') { | |
408 print OUT "##SNP_file=$snpfile\n"; | |
409 } | |
410 foreach(keys(%thash)) { | |
411 print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n"; | |
412 } | |
413 print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"; | |
414 print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n"; | |
415 print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n"; | |
416 print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n"; | |
417 print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n"; | |
418 print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"; | |
419 print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n"; | |
420 print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"; | |
421 close OUT; | |
422 @i = ( 1 .. 22,'X','Y','M' ); | |
423 foreach(@i) { | |
424 my $chr = "chr$_"; | |
425 foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) { | |
426 my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf"; | |
427 if (-e $v) { | |
428 system("cat '$v' >> '$outfile'"); | |
429 } | |
430 } | |
431 } | |
432 | |
433 ## clean up | |
434 system("rm -Rf '$wd'"); | |
435 | |
436 sub FetchFasta { | |
437 while(defined(my $line = $targets_one->dequeue())) { | |
438 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); | |
439 # 2bit is zero based, non-including => decrease start by one | |
440 $startposition = $start - 1; | |
441 my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta"; | |
442 system($command); | |
443 } | |
444 } | |
445 | |
446 sub CountAlleles { | |
447 # local version of hashes | |
448 my $snp = \%dbsnp; | |
449 my %counts; | |
450 $counts{'A'} = ''; | |
451 $counts{'C'} = ''; | |
452 $counts{'G'} = ''; | |
453 $counts{'T'} = ''; | |
454 my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4); | |
455 my %options; | |
456 foreach(keys(%map)) { | |
457 my $r = $_; | |
458 foreach(keys(%map)) { | |
459 if ($_ eq $r) { | |
460 next; | |
461 } | |
462 $options{$r.'-'.$_} = ''; | |
463 } | |
464 } | |
465 while (defined(my $line = $targets_two->dequeue())) { | |
466 $out = ''; | |
467 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); | |
468 ## get reference alleles | |
469 my %ref_alleles; | |
470 open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta"; | |
471 my $head = <FASTA>; | |
472 my $seq = ''; | |
473 while (<FASTA>) { | |
474 chomp; | |
475 $seq .= $_; | |
476 } | |
477 close FASTA; | |
478 # this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration. | |
479 for (my $pos = 0; $pos < length($seq); $pos++) { | |
480 $ref_alleles{($pos+$start)} = substr($seq,$pos,1); | |
481 } | |
482 ## get counts. | |
483 my $target = "$chr:$start-$stop"; | |
19
8938f339ed37
Added output of pdf report with distributions
geert-vandeweyer
parents:
17
diff
changeset
|
484 my $command = "igvtools count -w 1 --bases --query '$target' '$wd/input.bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1"; |
11 | 485 system($command); |
486 open WIG, "$wd/WIGS/$chr-$start-$stop.wig"; | |
487 my $h = <WIG>; | |
488 $h = <WIG>; | |
489 $h = <WIG>; | |
490 my $target_counts = ''; | |
491 while (<WIG>) { | |
492 chomp; | |
493 #my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_); | |
494 my @p = split(/\t/,$_); | |
495 my $s = $p[1] + $p[2] + $p[3] + $p[4]; | |
496 $target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n"; | |
497 ## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty). | |
498 if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) { | |
499 ## for model of 'non-reference' | |
500 my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s; | |
501 $counts{$ref_alleles{$p[0]}} .= $frac.','; | |
502 $out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n"; | |
503 ## for each of the options background models | |
504 foreach(keys(%map)) { | |
505 if ($_ eq $ref_alleles{$p[0]}) { | |
506 next; | |
507 } | |
508 $options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .','; | |
509 } | |
510 | |
511 } | |
512 } | |
513 close WIG; | |
514 open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt"; | |
515 flock(OUT, 2); | |
516 print OUT $out; | |
517 close OUT; | |
518 open OUT, ">$wd/WIGS/$chr.$start-$stop.txt"; | |
519 print OUT $target_counts; | |
520 close OUT; | |
521 | |
522 } | |
523 foreach(keys(%counts)) { | |
524 open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; | |
525 flock(OUT,2); | |
526 print OUT $counts{$_}; | |
527 close OUT; | |
528 } | |
529 foreach(keys(%options)) { | |
530 open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; | |
531 flock(OUT,2); | |
532 print OUT $options{$_}; | |
533 close OUT; | |
534 } | |
535 } | |
536 | |
537 sub GetDistribution { | |
538 while (defined(my $allele = $alleles->dequeue())) { | |
539 system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'"); | |
540 open OUT, ">$wd/GetDistribution.$allele.R"; | |
16 | 541 print OUT "\n"; |
11 | 542 print OUT "nt <- '$allele'\n"; |
543 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; | |
544 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; | |
545 print OUT "nr <- length(data)\n"; | |
546 print OUT "avg <- mean(data)\n"; | |
547 print OUT "sdd <- sd(data)\n"; | |
17 | 548 print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; |
11 | 549 close OUT; |
550 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); | |
551 } | |
552 } | |
553 | |
554 | |
555 sub CallSNPs { | |
556 while (defined(my $line = $targets_three->dequeue())) { | |
557 # split. | |
558 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); | |
559 my $file = "$wd/WIGS/$chr.$start-$stop.txt"; | |
560 my $ofile = "$wd/VCF/$chr.$start-$stop.vcf"; | |
561 system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'"); | |
562 } | |
563 | |
564 } | |
565 |