view mutspecFilter.pl @ 3:14fe7238c6d7 draft

Uploaded
author iarc
date Thu, 28 Apr 2016 03:59:27 -0400
parents 8c682b3a7c5b
children 916846f73e25
line wrap: on
line source

# !/usr/bin/perl

#-----------------------------------#
# Author: Maude                     #
# Script: mutspecFilter.pl          #
# Last update: 18/03/16             #
#-----------------------------------#

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
use File::Path;

################################################################################################################################################################################
#																																		Filter an Annotaed file with Annovar																		  																 #
################################################################################################################################################################################

our ($verbose, $man, $help)             = (0, 0, 0);    # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.
our ($output, $refGenome)               = ("", "");     # The path for saving the result; The reference genome to use.
our ($listAVDB)                         = "empty";      # Text file with the list Annovar databases.
our ($dir)                              = "";

GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2);

our ($input) = @ARGV;

pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)



# If the dbSNP value is not equal to zero filter using the dbSNP column specify
our $dbSNP = 0;
if($dbSNP_value > 0) { $dbSNP = 1; }


############ Check flags ############
if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }

# Zero databases is specified
if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )
{
	print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n";
	exit;
}



############ Recover the name of the databases to filter against ############
my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");
my @tab_protocol = ();

if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )
{
	### Recover the name of the column
	my $protocol = "";
	ExtractAVDBName($listAVDB, \$protocol);
	@tab_protocol = split(",", $protocol);

	for(my $i=0; $i<=$#tab_protocol; $i++)
	{
		if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
		elsif($tab_protocol[$i] =~ /1000g/)         { $thousandGenome_name = $tab_protocol[$i]; }
		elsif($tab_protocol[$i] =~ /esp/)           { $espAll_name = $tab_protocol[$i]; }
	}
}


############ Filter the file ############
filterAgainstPublicDB();


print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n";


sub filterAgainstPublicDB
{
	open(FILTER, ">", "$output") or die "$!: $output\n";

	open(F1, $input) or die "$!: $input\n";
	my $header = <F1>; print FILTER $header;
	while(<F1>)
	{
		$_      =~ s/[\r\n]+$//;
		my @tab = split("\t", $_);

		my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);

		if($segDup == 1)
		{
			my $segDup_value = recoverNumCol($input, $segDup_name);
			$segDupInfo      = formatSegDupInfo($tab[$segDup_value]);
			# Replace NA by 0 for making test on the same type of variable
			$segDupInfo =~ s/NA/0/;
		}
		if($esp == 1)
		{
			my $espAll_value = recoverNumCol($input, $espAll_name);
			$espAllInfo      = $tab[$espAll_value];
			# Replace NA by 0 for making test on the same type of variable
			$espAllInfo      =~ s/NA/0/;
		}
		if($thG == 1)
		{
			my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
			# Replace NA by 0 for making test on the same type of variable
			$thgInfo = $tab[$thousandGenome_value];
			$thgInfo =~ s/NA/0/;
		}


		##############################
		#   			One Filter 				 #
		##############################
		# Remove all the variants present in dbSNP
		if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } }
		# Remove all the variants with a frequency greater than or equal to 0.9  in genomic duplicate segments database
		if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9)            { print FILTER "$_\n"; } }
		# Remove all the variants with greater than 0.001 in Exome sequencing project
		if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) )    { if($espAllInfo <= 0.001)      { print FILTER "$_\n"; } }
		# Remove all the variants with greater than 0.001 in 1000 genome database
		if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) )    { if($thgInfo <= 0.001)         { print FILTER "$_\n"; } }


		#############################
		#   			Two Filter 				 #
		##############################
		if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) )    { print FILTER "$_\n"; } }
		if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) )  { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
		if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) )  { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) )    { print FILTER "$_\n"; } }

		if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) )   { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )           { print FILTER "$_\n"; } }
		if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )                { print FILTER "$_\n"; } }

		if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) )   { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )            { print FILTER "$_\n"; } }


		#############################
		#   		Three Filter 				 #
		##############################
		if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )
		{ print FILTER "$_\n"; } }
		if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )
		{ print FILTER "$_\n"; } }
		if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
		{ print FILTER "$_\n"; } }
		if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
		{ print FILTER "$_\n"; } }


		#############################
		#   		FOUR Filter 				 #
		##############################
		if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
		{ print FILTER "$_\n"; } }

	}
	close F1; close FILTER;
}


sub formatSegDupInfo
{
	my ($segDup_info) = @_;

	if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
	{
		my @segDup = split(";", $segDup_info);
		$segDup[0] =~ /Score=(.+)/;
		return $1;
	}
	else { return $segDup_info; }
}


sub ExtractAVDBName
{
	my ($listAVDB, $refS_protocol) = @_;

	open(F1, $listAVDB) or die "$!: $listAVDB\n";
	while(<F1>)
	{
		if ($_ =~ /^#/) { next; }

		$_      =~ s/[\r\n]+$//;
		my @tab = split("\t", $_);

		# db name like refGenome_dbName.txt
		if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
		{
			my $temp = $1;
			if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
		}
		# 1000 genome
		if($tab[0] =~ /sites/)
		{
			$tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
			my ($dbName, $year, $month) = ($1, $2, $3);
			$dbName =~ tr/A-Z/a-z/;

			# convert the month number into the month name
			ConvertMonth(\$month);

			my $AVdbName_final = "1000g".$year.$month."_".$dbName;

			if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
		}
		# ESP
		if($tab[0] =~ /esp/)
		{
			$tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
			my $AVdbName_final = $1."_".$2;

			if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
		}
	}
	close F1;

	sub ConvertMonth
	{
		my ($refS_month) = @_;

		if($$refS_month == 1)  { $$refS_month = "janv"; }
		elsif($$refS_month == 2)  { $$refS_month = "feb"; }
		elsif($$refS_month == 3)  { $$refS_month = "mar"; }
		elsif($$refS_month == 4)  { $$refS_month = "apr"; }
		elsif($$refS_month == 5)  { $$refS_month = "may"; }
		elsif($$refS_month == 6)  { $$refS_month = "jun"; }
		elsif($$refS_month == 7)  { $$refS_month = "jul"; }
		elsif($$refS_month == 8)  { $$refS_month = "aug"; }
		elsif($$refS_month == 9)  { $$refS_month = "sept"; }
		elsif($$refS_month == 10) { $$refS_month = "oct"; }
		elsif($$refS_month == 11) { $$refS_month = "nov"; }
		elsif($$refS_month == 12) { $$refS_month = "dec"; }
		else { print STDERR "Month number don't considered\n"; exit; }
	}
}


sub recoverNumCol
{
	my ($input, $name_of_column) = @_;

	# With Annovar updates the databases name changed and are present in an array
	if( ref($name_of_column) eq "ARRAY" )
	{
		my $test = "";
		my @tab = @$name_of_column;
		foreach (@tab)
		{
			open(F1,$input) or die "$!: $input\n";
		  # For having the name of the columns
		  my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
		  close F1;
		  # The number of the column
		  my $name_of_column_NB  = "toto";
		  for(my $i=0; $i<=$#tab_search_header; $i++)
		  {
		    if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
		  }
		  if($name_of_column_NB eq "toto") { next; }
		  else                             { return $name_of_column_NB; }
		}
		if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
	}
	# Only one name is pass
	else
	{
		open(FT,$input) or die "$!: $input\n";
	  # For having the name of the columns
	  my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
	  close FT;
	  # The number of the column
	  my $name_of_column_NB  = "toto";
	  for(my $i=0; $i<=$#tab_search_header; $i++)
	  {
	    if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
	  }
	  if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
	  else                        { return $name_of_column_NB; }
	}
}

=head1 NAME

mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)

=head1 SYNOPSIS

	mutspecFilter.pl [arguments] <query-file>

  <query-file>                                   an annotated file

  Arguments:
        -h,        --help                        print help message
        -m,        --man                         print complete documentation
        -v,        --verbose                     use verbose output
									 --dbSNP <value>               filter against dbSNP database. Specify the number of the dbSNP column in the file
									 --segDup                      filter against genomic duplicate database
									 --esp                         filter against Exome Sequencing Project database (only for human)
									 --thG                         filter against 1000 genome database (onyl for human)
			  -o,        --outfile <string>            name of output file
			             --refGenome                   reference genome to use
			             --pathAVDBList                path to the list of Annovar databases installed


Function: Filter out variants present in public databases

 Example: # Filter against dbSNP
 					mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input

 					# Filter against the four databases
 					mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input


 Version: 03-2016 (March 2016)


=head1 OPTIONS

=over 8

=item B<--help>

print a brief usage message and detailed explanation of options.

=item B<--man>

print the complete manual of the program.

=item B<--verbose>

use verbose output.

=item B<--dbSNP>

Remove all the variants presents in the dbSNP databases
Specify the number of column containing the annotation
For human and mouse genome

=item B<--segDup>

Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
For human and mouse genome

=item B<--esp>

Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
For human genome only

=item B<--thG>

Remove all the variants with a frequency greater than 0.001 in 1000 genome database

=item B<--refGenome>

the reference genome to use, could be hg19 or mm9.

=item B<--outfile>

the name of the output file

=item B<--pathAVDBList>

the path to a texte file containing the list of the Annovar databases installed.

=back

=head1 DESCRIPTION

mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)

=cut