view AnnotationStatsFromVCF/AnnotationStatsFromVCF.pl @ 11:15b23cdde685 draft

planemo upload commit 305985afd3b7c3d47f531149c2f1a279af2d12aa-dirty
author dereeper
date Fri, 20 Apr 2018 09:04:25 -0400
parents 98c37a5d67f4
children
line wrap: on
line source

#!/usr/bin/perl

use strict;

use Getopt::Long;
use Bio::SeqIO;

my $usage = qq~Usage:$0 <args> [<opts>]
where <args> are:
    -v, --vcf           <VCF input>
    -o, --out           <output>
    -s, --step          <step (in bp)>
~;
$usage .= "\n";

my ($vcf,$out,$step,$min_depth);
$min_depth = 5;

GetOptions(
	"vcf=s"      => \$vcf,
	"out=s"      => \$out,
	"min_depth=s"=> \$min_depth,
	"step=s"     => \$step,
);


die $usage
  if ( !$vcf || !$step || !$out );

if ($step =~/^(\d+)\s*$/){
	$step = $1;
}
else{
	die "Error: step size must be an integer\n";
}


my $VCFTOOLS_EXE = "vcftools"; 
 
my %max;
my %counts_ns;
my %counts_s;
my $nb_gene = 0;
my $nb_intergenic = 0;
my $nb_exon = 0;
my $nb_intron = 0;
my $nb_UTR = 0;
my $nb_syn = 0;
my $nb_nsyn = 0;
if ($vcf =~/\.bcf/){
	system("$VCFTOOLS_EXE --bcf $vcf --recode --recode-INFO-all --out $out");
	$vcf = "$out.recode.vcf";
}
open(my $VCF,$vcf);
while(<$VCF>)
{
	my @infos = split(/\t/,$_);
	if (scalar @infos > 8 && !/#CHROM/)
	{
		my $chrom = $infos[0];
		my $position = $infos[1];
		my $id = $infos[2];
		
		
		my $classe_position = int($position/$step);
		if (/=NON_SYNONYMOUS_CODING/){
			$counts_ns{$chrom}{$classe_position}++;
			$nb_nsyn++;
		}
		if (/=SYNONYMOUS_CODING/){
			$counts_s{$chrom}{$classe_position}++;
			$nb_syn++;
		}
		if (/INTERGENIC/){
			$nb_intergenic++;
		}
		elsif (/EFF=/){
			$nb_gene++;
		}
		if (/CODING/){
			}
		elsif (/UTR/){
			$nb_UTR++;
		}
		elsif (/INTRON/){
			$nb_intron++;
		}
		$max{$chrom} = $classe_position;
	}
}
my $nb_exon = $nb_gene - $nb_intron - $nb_UTR;

open(my $OUT,">$out");
#print $OUT "Chrom	Bin	Nb synonymous SNPs	Nb non-synonymous SNPs	dN/dS ratio\n";
print $OUT "Chrom	Bin	dN/dS ratio\n";
foreach my $chrom(sort keys(%counts_s))
{
	my $maximum = $max{$chrom};
	for(my $i=1;$i<=$maximum;$i++)
	{
		my $classe_position = $i;
		my $nb_s = 0;
		my $nb_ns = 0;
		my $ratio = 0;
		if ($counts_s{$chrom}{$classe_position}){$nb_s = $counts_s{$chrom}{$classe_position};}
		if ($counts_ns{$chrom}{$classe_position}){$nb_ns = $counts_ns{$chrom}{$classe_position};}
		if ($nb_s){$ratio = $nb_ns/$nb_s;}
		my $bin = $classe_position * $step;
		#print $OUT "$chrom	$classe_position	$nb_s	$nb_ns	$ratio\n";
		print $OUT "$chrom	$bin	$ratio\n";
	}
}
close($OUT);



open(my $A2, ">$out.location");
print $A2 "Intergenic	$nb_intergenic	Intergenic:$nb_intergenic\n";
print $A2 "Genic	$nb_gene	Exon:$nb_exon	Intron:$nb_intron	UTR:$nb_UTR\n";
close($A2);

open(my $A2, ">$out.effect");
print $A2 "Intron	$nb_intron	Intron:$nb_intron\n";
print $A2 "UTR	$nb_UTR	UTR:$nb_UTR\n";
print $A2 "Exon	$nb_exon	Synonym:$nb_syn	Non-syn:$nb_nsyn\n";
close($A2);