diff mutspecFilter.pl @ 0:8c682b3a7c5b draft

Uploaded
author iarc
date Tue, 19 Apr 2016 03:07:11 -0400
parents
children 916846f73e25
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutspecFilter.pl	Tue Apr 19 03:07:11 2016 -0400
@@ -0,0 +1,378 @@
+# !/usr/bin/perl
+
+#-----------------------------------#
+# Author: Maude                     #
+# Script: mutspecFilter.pl          #
+# Last update: 18/03/16             #
+#-----------------------------------#
+
+use strict;
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
+use File::Path;
+
+################################################################################################################################################################################
+#																																		Filter an Annotaed file with Annovar																		  																 #
+################################################################################################################################################################################
+
+our ($verbose, $man, $help)             = (0, 0, 0);    # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
+our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.
+our ($output, $refGenome)               = ("", "");     # The path for saving the result; The reference genome to use.
+our ($listAVDB)                         = "empty";      # Text file with the list Annovar databases.
+our ($dir)                              = "";
+
+GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2);
+
+our ($input) = @ARGV;
+
+pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
+pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
+pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
+pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
+
+
+
+# If the dbSNP value is not equal to zero filter using the dbSNP column specify
+our $dbSNP = 0;
+if($dbSNP_value > 0) { $dbSNP = 1; }
+
+
+############ Check flags ############
+if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }
+
+# Zero databases is specified
+if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )
+{
+	print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n";
+	exit;
+}
+
+
+
+############ Recover the name of the databases to filter against ############
+my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");
+my @tab_protocol = ();
+
+if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )
+{
+	### Recover the name of the column
+	my $protocol = "";
+	ExtractAVDBName($listAVDB, \$protocol);
+	@tab_protocol = split(",", $protocol);
+
+	for(my $i=0; $i<=$#tab_protocol; $i++)
+	{
+		if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
+		elsif($tab_protocol[$i] =~ /1000g/)         { $thousandGenome_name = $tab_protocol[$i]; }
+		elsif($tab_protocol[$i] =~ /esp/)           { $espAll_name = $tab_protocol[$i]; }
+	}
+}
+
+
+############ Filter the file ############
+filterAgainstPublicDB();
+
+
+print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n";
+
+
+sub filterAgainstPublicDB
+{
+	open(FILTER, ">", "$output") or die "$!: $output\n";
+
+	open(F1, $input) or die "$!: $input\n";
+	my $header = <F1>; print FILTER $header;
+	while(<F1>)
+	{
+		$_      =~ s/[\r\n]+$//;
+		my @tab = split("\t", $_);
+
+		my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);
+
+		if($segDup == 1)
+		{
+			my $segDup_value = recoverNumCol($input, $segDup_name);
+			$segDupInfo      = formatSegDupInfo($tab[$segDup_value]);
+			# Replace NA by 0 for making test on the same type of variable
+			$segDupInfo =~ s/NA/0/;
+		}
+		if($esp == 1)
+		{
+			my $espAll_value = recoverNumCol($input, $espAll_name);
+			$espAllInfo      = $tab[$espAll_value];
+			# Replace NA by 0 for making test on the same type of variable
+			$espAllInfo      =~ s/NA/0/;
+		}
+		if($thG == 1)
+		{
+			my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
+			# Replace NA by 0 for making test on the same type of variable
+			$thgInfo = $tab[$thousandGenome_value];
+			$thgInfo =~ s/NA/0/;
+		}
+
+
+		##############################
+		#   			One Filter 				 #
+		##############################
+		# Remove all the variants present in dbSNP
+		if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } }
+		# Remove all the variants with a frequency greater than or equal to 0.9  in genomic duplicate segments database
+		if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9)            { print FILTER "$_\n"; } }
+		# Remove all the variants with greater than 0.001 in Exome sequencing project
+		if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) )    { if($espAllInfo <= 0.001)      { print FILTER "$_\n"; } }
+		# Remove all the variants with greater than 0.001 in 1000 genome database
+		if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) )    { if($thgInfo <= 0.001)         { print FILTER "$_\n"; } }
+
+
+		#############################
+		#   			Two Filter 				 #
+		##############################
+		if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) )    { print FILTER "$_\n"; } }
+		if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) )  { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
+		if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) )  { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) )    { print FILTER "$_\n"; } }
+
+		if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) )   { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )           { print FILTER "$_\n"; } }
+		if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )                { print FILTER "$_\n"; } }
+
+		if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) )   { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )            { print FILTER "$_\n"; } }
+
+
+		#############################
+		#   		Three Filter 				 #
+		##############################
+		if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )
+		{ print FILTER "$_\n"; } }
+		if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )
+		{ print FILTER "$_\n"; } }
+		if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
+		{ print FILTER "$_\n"; } }
+		if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
+		{ print FILTER "$_\n"; } }
+
+
+		#############################
+		#   		FOUR Filter 				 #
+		##############################
+		if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
+		{ print FILTER "$_\n"; } }
+
+	}
+	close F1; close FILTER;
+}
+
+
+sub formatSegDupInfo
+{
+	my ($segDup_info) = @_;
+
+	if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
+	{
+		my @segDup = split(";", $segDup_info);
+		$segDup[0] =~ /Score=(.+)/;
+		return $1;
+	}
+	else { return $segDup_info; }
+}
+
+
+sub ExtractAVDBName
+{
+	my ($listAVDB, $refS_protocol) = @_;
+
+	open(F1, $listAVDB) or die "$!: $listAVDB\n";
+	while(<F1>)
+	{
+		if ($_ =~ /^#/) { next; }
+
+		$_      =~ s/[\r\n]+$//;
+		my @tab = split("\t", $_);
+
+		# db name like refGenome_dbName.txt
+		if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
+		{
+			my $temp = $1;
+			if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
+		}
+		# 1000 genome
+		if($tab[0] =~ /sites/)
+		{
+			$tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
+			my ($dbName, $year, $month) = ($1, $2, $3);
+			$dbName =~ tr/A-Z/a-z/;
+
+			# convert the month number into the month name
+			ConvertMonth(\$month);
+
+			my $AVdbName_final = "1000g".$year.$month."_".$dbName;
+
+			if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
+		}
+		# ESP
+		if($tab[0] =~ /esp/)
+		{
+			$tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
+			my $AVdbName_final = $1."_".$2;
+
+			if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
+		}
+	}
+	close F1;
+
+	sub ConvertMonth
+	{
+		my ($refS_month) = @_;
+
+		if($$refS_month == 1)  { $$refS_month = "janv"; }
+		elsif($$refS_month == 2)  { $$refS_month = "feb"; }
+		elsif($$refS_month == 3)  { $$refS_month = "mar"; }
+		elsif($$refS_month == 4)  { $$refS_month = "apr"; }
+		elsif($$refS_month == 5)  { $$refS_month = "may"; }
+		elsif($$refS_month == 6)  { $$refS_month = "jun"; }
+		elsif($$refS_month == 7)  { $$refS_month = "jul"; }
+		elsif($$refS_month == 8)  { $$refS_month = "aug"; }
+		elsif($$refS_month == 9)  { $$refS_month = "sept"; }
+		elsif($$refS_month == 10) { $$refS_month = "oct"; }
+		elsif($$refS_month == 11) { $$refS_month = "nov"; }
+		elsif($$refS_month == 12) { $$refS_month = "dec"; }
+		else { print STDERR "Month number don't considered\n"; exit; }
+	}
+}
+
+
+sub recoverNumCol
+{
+	my ($input, $name_of_column) = @_;
+
+	# With Annovar updates the databases name changed and are present in an array
+	if( ref($name_of_column) eq "ARRAY" )
+	{
+		my $test = "";
+		my @tab = @$name_of_column;
+		foreach (@tab)
+		{
+			open(F1,$input) or die "$!: $input\n";
+		  # For having the name of the columns
+		  my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
+		  close F1;
+		  # The number of the column
+		  my $name_of_column_NB  = "toto";
+		  for(my $i=0; $i<=$#tab_search_header; $i++)
+		  {
+		    if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
+		  }
+		  if($name_of_column_NB eq "toto") { next; }
+		  else                             { return $name_of_column_NB; }
+		}
+		if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
+	}
+	# Only one name is pass
+	else
+	{
+		open(FT,$input) or die "$!: $input\n";
+	  # For having the name of the columns
+	  my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
+	  close FT;
+	  # The number of the column
+	  my $name_of_column_NB  = "toto";
+	  for(my $i=0; $i<=$#tab_search_header; $i++)
+	  {
+	    if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
+	  }
+	  if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
+	  else                        { return $name_of_column_NB; }
+	}
+}
+
+=head1 NAME
+
+mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
+
+=head1 SYNOPSIS
+
+	mutspecFilter.pl [arguments] <query-file>
+
+  <query-file>                                   an annotated file
+
+  Arguments:
+        -h,        --help                        print help message
+        -m,        --man                         print complete documentation
+        -v,        --verbose                     use verbose output
+									 --dbSNP <value>               filter against dbSNP database. Specify the number of the dbSNP column in the file
+									 --segDup                      filter against genomic duplicate database
+									 --esp                         filter against Exome Sequencing Project database (only for human)
+									 --thG                         filter against 1000 genome database (onyl for human)
+			  -o,        --outfile <string>            name of output file
+			             --refGenome                   reference genome to use
+			             --pathAVDBList                path to the list of Annovar databases installed
+
+
+Function: Filter out variants present in public databases
+
+ Example: # Filter against dbSNP
+ 					mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
+
+ 					# Filter against the four databases
+ 					mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
+
+
+ Version: 03-2016 (March 2016)
+
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<--help>
+
+print a brief usage message and detailed explanation of options.
+
+=item B<--man>
+
+print the complete manual of the program.
+
+=item B<--verbose>
+
+use verbose output.
+
+=item B<--dbSNP>
+
+Remove all the variants presents in the dbSNP databases
+Specify the number of column containing the annotation
+For human and mouse genome
+
+=item B<--segDup>
+
+Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
+For human and mouse genome
+
+=item B<--esp>
+
+Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
+For human genome only
+
+=item B<--thG>
+
+Remove all the variants with a frequency greater than 0.001 in 1000 genome database
+
+=item B<--refGenome>
+
+the reference genome to use, could be hg19 or mm9.
+
+=item B<--outfile>
+
+the name of the output file
+
+=item B<--pathAVDBList>
+
+the path to a texte file containing the list of the Annovar databases installed.
+
+=back
+
+=head1 DESCRIPTION
+
+mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
+
+=cut