Mercurial > repos > yusuf > miseq_bam_variants
comparison call_miseq_variants @ 0:1a23ea467feb default tip
intial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Thu, 26 Mar 2015 09:36:17 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1a23ea467feb |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # Locked down procedure for variant calling | |
| 4 use strict; | |
| 5 use warnings; | |
| 6 use File::Basename; | |
| 7 #@ARGV == 7 or die "Usage: $0 <sample name> <output dir> <deduped_realigned_input.bam> <target_regions.bed> <gene_regions.bed> <reference.fasta> <max num processes>\n"; | |
| 8 | |
| 9 # parse configuration file | |
| 10 my $dirname= dirname(__FILE__); | |
| 11 my $tool_data = shift @ARGV; | |
| 12 my %config; | |
| 13 if( not -e "$tool_data/miseq_bam_variants.loc" ){ | |
| 14 system("cp $dirname/tool-data/miseq_bam_variants.loc $tool_data/miseq_bam_variants.loc"); | |
| 15 } | |
| 16 open CONFIG, '<',"$tool_data/miseq_bam_variants.loc"; | |
| 17 while(<CONFIG>){ | |
| 18 (my $key, my $value) = split(/\s+/, $_ ); | |
| 19 $config{$key} = $value; | |
| 20 } | |
| 21 my $dbs_dir = $config{"reference_dbs"}; | |
| 22 my $name_dir = $config{"named_regions_dir"}; | |
| 23 my $capt_dir = $config{"capture_kits_dir"}; | |
| 24 close CONFIG; | |
| 25 | |
| 26 my $sample_name = shift @ARGV; | |
| 27 my $outdir = shift @ARGV; | |
| 28 my $bam = shift @ARGV; | |
| 29 my $bed = $capt_dir . (shift @ARGV); | |
| 30 my $gene_regions = $name_dir . (shift @ARGV); | |
| 31 my $fasta = $dbs_dir . (shift @ARGV); | |
| 32 my $max_processes = shift @ARGV; | |
| 33 | |
| 34 my $log = "$outdir/$sample_name.call_variants.log.txt"; | |
| 35 my $gatk = "$outdir/$sample_name.gatk_haplotypecaller"; | |
| 36 my $freeBayes = "$outdir/$sample_name.freeBayes"; | |
| 37 my $atlas2_indel = "$outdir/$sample_name.atlas2_indel"; | |
| 38 my $phase = "$outdir/$sample_name.phase"; | |
| 39 my $cover = "$outdir/$sample_name.read_coverage"; | |
| 40 my $flagstat = "$outdir/$sample_name.flagstat"; | |
| 41 | |
| 42 my @cmds; | |
| 43 @cmds = ( "$dirname/gatk_haplotypecaller_parallel $bam ".($max_processes > 5 ? int($max_processes*2/3+0.5) : 2)." $gatk.vcf -R $fasta --emitRefConfidence GVCF -variant_index_type LINEAR -variant_index_parameter 128000 &>> $log", | |
| 44 "$dirname/freebayes_parallel $bam ".($max_processes > 5 ? int(($max_processes-1)/3+0.5) : 2)." $freeBayes.vcf -j -C 2 -i -f $fasta -X &>> $log", | |
| 45 "samtools phase -F $bam > $phase.txt 2>> $log"); | |
| 46 &run_cmds($max_processes, @cmds); | |
| 47 | |
| 48 @cmds = ("$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $gatk.vcf -o $gatk.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt", | |
| 49 "$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $freeBayes.vcf -o $freeBayes.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt"); | |
| 50 &run_cmds($max_processes, @cmds); | |
| 51 | |
| 52 my $combined = "$outdir/$sample_name.combined"; | |
| 53 system "$dirname/combine_hgvs_tables -q true $combined.hgvs.txt GATKHaplotypeCaller $gatk.hgvs.txt FreeBayes $freeBayes.hgvs.txt >> $log"; | |
| 54 system "$dirname/hgvs_collapse_transcripts $combined.hgvs.txt $combined.collapsed.hgvs.txt 1 >> $log"; | |
| 55 system "$dirname/split_hgvs_by_confidence $combined.collapsed.hgvs.txt $combined.collapsed.confident.hgvs.txt $combined.collapsed.marginal.hgvs.txt 2 &>> $log"; | |
| 56 | |
| 57 # Check the logs for anything untowards | |
| 58 # e.g. FreeBayes: jumping is disabled | |
| 59 # or any line saying "error" | |
| 60 my @errors; | |
| 61 open(LOG, $log) | |
| 62 or die "Could not check log file for error message, therefore the validity of the genotypes cannot be guaranteed\n"; | |
| 63 while(<LOG>){ | |
| 64 if(/error/i or /jumping is disabled/){ | |
| 65 push @errors, $_; | |
| 66 } | |
| 67 } | |
| 68 close(LOG); | |
| 69 | |
| 70 if(@errors){ | |
| 71 print "Errors were detected in the genotyping log:\n", @errors; | |
| 72 exit 1; # failure | |
| 73 } | |
| 74 exit 0; #success | |
| 75 | |
| 76 sub run_cmds{ | |
| 77 | |
| 78 my ($max_cmds, @cmd) = @_; | |
| 79 | |
| 80 my ($num_children, $pid); | |
| 81 | |
| 82 for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ | |
| 83 # initialize the number of child processes at 1, and increment it by one | |
| 84 #while it is less than $max_cmds | |
| 85 | |
| 86 my $cmd = shift (@cmds); | |
| 87 if($pid = fork) { | |
| 88 # do nothing if parent | |
| 89 } elsif(defined $pid) { # $pid is zero here if defined | |
| 90 #print STDERR $cmd, "\n"; | |
| 91 system $cmd; | |
| 92 exit; | |
| 93 } else { | |
| 94 #weird fork error | |
| 95 die "Can't fork: $!\n"; | |
| 96 } | |
| 97 } | |
| 98 | |
| 99 while(@cmds) { | |
| 100 undef $pid; | |
| 101 FORK: { | |
| 102 my $cmd = shift (@cmds); | |
| 103 if($pid = fork) { | |
| 104 # parent here | |
| 105 $num_children++; | |
| 106 wait; | |
| 107 $num_children--; | |
| 108 next; | |
| 109 | |
| 110 } elsif(defined $pid) { # $pid is zero here if defined | |
| 111 #print STDERR $cmd, "\n"; | |
| 112 system $cmd; | |
| 113 exit; | |
| 114 | |
| 115 } else { | |
| 116 #weird fork error | |
| 117 die "Can't fork: $!\n"; | |
| 118 } | |
| 119 } | |
| 120 } | |
| 121 wait while $num_children--; | |
| 122 } |
