diff ACorF/Analytic_correlation_filtration.pl @ 2:15430e89c97d draft default tip

Uploaded
author melpetera
date Thu, 07 Nov 2019 03:42:14 -0500
parents 26aa3a8f95ce
children
line wrap: on
line diff
--- a/ACorF/Analytic_correlation_filtration.pl	Wed Oct 30 06:54:20 2019 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,651 +0,0 @@
-#!usr/bin/perl
-
-### Perl modules
-use warnings;
-use strict;
-use Getopt::Long qw(GetOptions); #Creation of script options
-use Pod::Usage qw(pod2usage); #Creation of script options
-
-#Personnal packages
-use FindBin ; ## Allows you to locate the directory of original perl script
-#use lib $FindBin::Bin;
-use lib "$FindBin::Bin/lib";
-use IonFiltration;
-
-my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete
-
-########################
-### Options and help ###
-########################
-
-GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2);
-
-### Check required parameters :
-pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file;
-#pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file;
-pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation.
-#It should be a number (1 ; 2 or 3) :
-#	1 : Don't take into acount mass information (only RT) ;
-#	2 : Check that all mass differences are include in a specific list and taking into acount RT information
-#	3 : Check that all mass differences are include in a specific list, ignoring RT information
-#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt;
-pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation.
-It should be one of the 3 options below :
-	"mass" : choose the ion with the highest mass as the representant
-	"intensity" : choose the ion with the highest intensity as the representant
-	"mixt" : choose the ion with the highest (mass^2 * intensity) as the representant
-	"max_intensity_max_mass" : choose tha ion witht he highest intenisty among the 5 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt;
-pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix;
-pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM;
-#pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold;
-#pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold;
-pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold;
-pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular;
-pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif;
-
-
-#if(($opt != 1) && ($opt != 2) && ($opt != 3)){
-#	print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n";
-#	exit;
-#}
-
-	
-
-if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){
-	print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n";
-	exit;
-}
-
-
-
-#########################################################################
-#### Création of a hash containing all adduits and fragments possible ###
-#########################################################################
-
-my %hmass;
-if($opt != 1){
-	%hmass = IonFiltration::MassCollecting($mass_file);
-	
-}
-
-my $refhmass = \%hmass;
-
-print "Création of a hash containing all adduits and fragments possible\n";
-
-	
-########################################################
-### Creation of a sif table + correlation filtration ###
-########################################################
-
-my %hrtmz;
-($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass);
-print "Creation of a sif table + correlation filtration done\n";
-
-
-######################################################
-### Analytic correlation filtrering follow options ###
-######################################################
-
-my %hheader_file;
-my %hduplicate;
-	
-my %hcorrelgroup;
-my $groupct=1;
-
-my $linenb3=0;
-my %hheader_line;
-
-
-
-open (F1, $output_sif) or die "Impossible to open $output_sif\n";
-	
-while(my $line = <F1>){
-	my $count=0;
-	chomp $line;
-	my @tline = split(/\t/, $line);
-	my $a = $tline[0];
-	my $b = $tline[2];
-							
-	my $amass=$hrtmz{$a}{mz};
-	my $atemp=$hrtmz{$a}{rt};
-	my $bmass= $hrtmz{$b}{mz};
-	my $btemp=$hrtmz{$b}{rt};
-	my $diff = $amass-$bmass;
-	$diff = abs($diff);
-								
-	### Option 1: Don't take into acount mass information ###
-					
-	if($opt == 1){
-		my $btplus = $btemp + $rt_threshold;
-		my $btmoins = $btemp - $rt_threshold;
-		if(($btmoins <= $atemp) && ($atemp <= $btplus)){
-			foreach my $k (keys %hcorrelgroup){
-				if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
-					$hcorrelgroup{$k}{$a}=1;
-					$hcorrelgroup{$k}{$b}=1;
-					$count++;
-					last;
-				}
-			}
-			if($count == 0){
-				my $groupnb="group".$groupct;
-				$hcorrelgroup{$groupnb}{$a}=1;
-				$hcorrelgroup{$groupnb}{$b}=1;
-				$groupct ++;
-			}
-		}
-	}
-
-									
-								
-	### Option 2: Check that all mass differences are include in a specific list taking into account RT information ###
-									
-	elsif($opt == 2){
-										
-		my $print = 0;
-		foreach my $s (keys %{$refhmass}){
-			foreach my $r (keys %{$refhmass->{$s}}){
-				my $rm = $r - $mass_threshold;
-				my $rp = $r + $mass_threshold;
-				if(($diff <= $rp) && ($diff >= $rm)){
-					if($print == 0){
-						my $btplus = $btemp + $rt_threshold;
-						my $btmoins = $btemp - $rt_threshold;
-														
-						if(($btmoins <= $atemp) && ($atemp <= $btplus)){
-							foreach my $k (keys %hcorrelgroup){
-								if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
-									$hcorrelgroup{$k}{$a}=1;
-									$hcorrelgroup{$k}{$b}=1;
-									$count++;
-									last;
-								}
-							}
-							if($count == 0){
-								my $groupnb="group".$groupct;
-								$hcorrelgroup{$groupnb}{$a}=1;
-								$hcorrelgroup{$groupnb}{$b}=1;
-								$groupct ++;
-							}
-							$print = 1;
-						}
-					}
-				}
-			}
-		}
-	}
-									
-								
-	### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ###
-					
-	elsif($opt == 3){
-										
-		my $print = 0;
-		foreach my $s (keys %{$refhmass}){
-			foreach my $r (keys %{$refhmass->{$s}}){
-				my $rm = $r - $mass_threshold;
-				my $rp = $r + $mass_threshold;
-				if(($diff <= $rp) && ($diff >= $rm)){
-					if($print == 0){
-							
-						foreach my $k (keys %hcorrelgroup){
-							if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
-								$hcorrelgroup{$k}{$a}=1;
-								$hcorrelgroup{$k}{$b}=1;
-								$count++;
-								last;
-							}
-						}
-						if($count == 0){
-							my $groupnb="group".$groupct;
-							$hcorrelgroup{$groupnb}{$a}=1;
-							$hcorrelgroup{$groupnb}{$b}=1;
-							$groupct ++;
-						}
-						$print = 1;
-					}
-				}
-			}
-		}
-	}
-}
-close F1;
-
-print "Analytic correlation filtrering follow options done\n";
-	
-	
-#############################################
-### Join groups that have been subdivided ###
-#############################################
-
-my @tdelete;
-
-foreach my $k (keys %hcorrelgroup){
-	foreach my $i (keys %{$hcorrelgroup{$k}}){
-		foreach my $v (keys %hcorrelgroup){
-			my $count = 0;
-			if ($v ne $k){
-				foreach my $w (keys %{$hcorrelgroup{$v}}){
-					if($w eq $i){
-						$count = 1;
-						push(@tdelete, $v);
-					}	
-				}
-			}
-			if($count == 1){
-				foreach my $w (keys %{$hcorrelgroup{$v}}){
-					$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
-				}
-				delete($hcorrelgroup{$v});
-			}
-		}
-	}
-}
-	
-foreach my $t (@tdelete){
-	delete($hcorrelgroup{$t});
-}
-
-
-### Do it twice to see if it fix the problem of unmerge groups
-
-foreach my $k (keys %hcorrelgroup){
-	foreach my $i (keys %{$hcorrelgroup{$k}}){
-		foreach my $v (keys %hcorrelgroup){
-			my $count = 0;
-			if ($v ne $k){
-				foreach my $w (keys %{$hcorrelgroup{$v}}){
-					if($w eq $i){
-						$count = 1;
-						push(@tdelete, $v);
-					}	
-				}
-			}
-			if($count == 1){
-				foreach my $w (keys %{$hcorrelgroup{$v}}){
-					$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
-				}
-				delete($hcorrelgroup{$v});
-			}
-		}
-	}
-}
-	
-foreach my $t (@tdelete){
-	delete($hcorrelgroup{$t});
-}
-	
-print "Join groups that have been subdivided done\n";
-	
-#######################################################
-### Addition of annotation information among groups ###
-#######################################################
-
-foreach my $k (keys %hcorrelgroup){
-	foreach my $i (keys %{$hcorrelgroup{$k}}){
-		foreach my $j (keys %{$hcorrelgroup{$k}}){
-			my $count = 0;
-			if ($i ne $j){
-				
-				my $a = $hrtmz{$i}{mz};
-				my $b = $hrtmz{$j}{mz};
-				
-				my $diff = $a - $b;
-				my $sign;
-				if($diff>0){
-					$sign="+";
-				}
-				if($diff<0){
-					$sign="-";
-				}
-				$diff = abs($diff);
-				
-				foreach my $z (keys %{$refhmass}){
-					
-					foreach my $y (keys %{$refhmass->{$z}}){
-						my $ym = $y - $mass_threshold;
-						my $yp = $y + $mass_threshold;
-						
-						
-						if(($diff <= $yp) && ($diff >= $ym)){
-							my $diff_list = $diff - $y;
-							$diff_list = abs($diff_list);
-							$diff_list = sprintf ("%0.6f", $diff_list);
-							
-							if($hcorrelgroup{$k}{$i} eq 1){
-								my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
-								$hcorrelgroup{$k}{$i}=$val;
-								$count ++;
-							}
-							else{
-								if($count == 0){
-									my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|";
-									$hcorrelgroup{$k}{$i}.=$val;
-									$count ++;
-								}
-								else{
-									my $val = $sign."(".$z.")(".$diff_list.")|";
-									$hcorrelgroup{$k}{$i}.=$val;
-									$count ++;
-								}
-							}
-						}
-					}
-				}
-			}
-		}
-	}
-}
-	
-
-print "Addition of annotation information among groups done\n";
-	
-	
-####################################################
-### Choose the representative ion for each group ###
-####################################################
-
-my %hgrouprepres;
-	
-open(F3, $dataMatrix);
-
-while (my $line = <F3>){
-	chomp $line;
-	
-	my @tline = split (/\t/, $line);
-	
-	foreach my $k (keys %hcorrelgroup){
-		foreach my $i (keys %{$hcorrelgroup{$k}}){
-			if($tline[0] eq $i){
-				$hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz};
-				my $intensity;
-				my $nbsubjects=0;
-				for(my $y=1;$y<scalar(@tline);$y++){
-					$intensity += $tline[$y];
-					$nbsubjects ++;
-				}
-				my $meanintensity = $intensity/$nbsubjects;
-				$hgrouprepres{$k}{$i}{intensity}=$meanintensity;
-				$hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity});
-			}
-		}
-	}
-}
-close F3;
-	
-foreach my $z (keys %hgrouprepres){
-	my $max_intensity =  0;
-	my $max_int_ion = "";
-	my $max_mass = 0;
-	my $max_mass_ion = "";
-	my $max_squared = 0;
-	my $max_squared_ion = "";
-	foreach my $w (keys %{$hgrouprepres{$z}}){
-		if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){
-			$max_intensity = $hgrouprepres{$z}{$w}{intensity};
-			$max_int_ion = $w;
-		}
-		if($hgrouprepres{$z}{$w}{mass} > $max_mass){
-			$max_mass = $hgrouprepres{$z}{$w}{mass};
-			$max_mass_ion = $w;
-		}
-		if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){
-			$max_squared = $hgrouprepres{$z}{$w}{squaredmassint};
-			$max_squared_ion = $w;
-		}
-	}
-	
-	my $max_int_max_mass_ion="";
-	
-	if($repres_opt eq "max_intensity_max_mass"){
-		my %hfirst;
-		my $first=0;
-		foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){
-			$first ++;
-			if ($first <= 3){
-				$hfirst{$w} = $hgrouprepres{$z}{$w}{intensity};
-			}
-		}
-		
-		my $first_2 = 0;
-		my $intens_max = 0;
-		my $mass_max = 0;
-		
-		foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){
-			
-			$first_2 ++;
-			if($first_2 == 1){
-				$intens_max = $hfirst{$y};
-				if($intensity_threshold > $intens_max){
-					$intensity_threshold = 0;
-				}
-				$max_int_max_mass_ion = $y;
-				$mass_max = $hgrouprepres{$z}{$y}{mass};
-			}
-			if($hgrouprepres{$z}{$y}{mass} > $mass_max){
-				if($hfirst{$y}>$intensity_threshold){
-					my $a = $intens_max * $intensity_pourc;
-					if($hfirst{$y} > $a){
-						$max_int_max_mass_ion = $y;
-						$mass_max = $hgrouprepres{$z}{$y}{mass};
-					}
-				}
-			}
-		}
-	}
-	
-	$hgrouprepres{$z}{max_int}=$max_int_ion;
-	$hgrouprepres{$z}{max_mass}=$max_mass_ion;
-	$hgrouprepres{$z}{max_squared}=$max_squared_ion;
-	$hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion;
-	
-}
-
-
-print "Choose the representative ion for each group done\n";
-
-#############################################################################
-### Addition of annotation information relative to the representative ion ###
-#############################################################################
-	
-my %hreprescomparison;
-
-my $representative="";
-
-if($opt != 1){
-	foreach my $k (keys %hcorrelgroup){
-		foreach my $i (keys %{$hcorrelgroup{$k}}){
-												
-			if($repres_opt eq "mass"){$representative = $hgrouprepres{$k}{max_mass}}
-			if($repres_opt eq "intensity"){$representative = $hgrouprepres{$k}{max_int}}
-			if($repres_opt eq "mixt"){$representative = $hgrouprepres{$k}{max_squared}}
-			if($repres_opt eq "max_intensity_max_mass"){$representative = $hgrouprepres{$k}{max_int_max_mass}}
-			
-			
-			my $count = 0;
-			if ($i ne $representative){
-					
-				my $a = $hrtmz{$i}{mz};
-				my $b = $hrtmz{$representative}{mz};
-				
-				my $diff = $a - $b;
-				my $sign;
-				if($diff>0){
-					$sign="+";
-				}
-				if($diff<0){
-					$sign="-";
-				}
-				$diff = abs($diff);
-				
-				foreach my $z (keys %{$refhmass}){
-					
-					foreach my $y (keys %{$refhmass->{$z}}){
-						my $ym = $y - $mass_threshold;
-						my $yp = $y + $mass_threshold;
-						
-						if(($diff <= $yp) && ($diff >= $ym)){
-							my $diff_list = $diff - $y;
-							$diff_list = abs($diff_list);
-							$diff_list = sprintf ("%0.4f", $diff_list);
-							if($hcorrelgroup{$k}{$i} eq 1){
-								my $valrep = "[M ".$sign."(".$z.")]|";
-								$hreprescomparison{$k}{$i}{repres_diff}=$valrep;
-								$count ++;
-							}
-							else{
-								if($count == 0){
-									my $valrep = "[M ".$sign."(".$z.")]|";
-									$hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
-									$count ++;
-								}
-								else{
-									my $valrep = "[M ".$sign."(".$z.")]|";
-									$hreprescomparison{$k}{$i}{repres_diff}.=$valrep;
-									$count ++;
-								}
-							}
-						}
-					}
-				}
-			}
-			else{
-				$hreprescomparison{$k}{$i}{repres_diff}="M";
-			}
-		}
-	}
-}
-	
-
-print "Addition of annotation information relative to the representative ion done\n";
-	
-##############################
-### Print in result file ! ###
-##############################
-
-open(F4, ">$output_tabular");
-open(F5, $combined_DMVM);
-
-my $line_nb = 0;
-my %hheader;
-while (my $line = <F5>){
-	chomp $line;
-	
-	
-	my @tline = split (/\t/, $line);
-	
-	if($line_nb == 0){
-		print F4 "$line\tACorF_groups";
-		if($opt == 1){
-			if($repres_opt eq "intensity"){print F4 "\tACorF_filter\tintensity_repres\n"}
-			if($repres_opt eq "mass"){print F4 "\tACorF_filter\tmass_repres\n"}
-			if($repres_opt eq "mixt"){print F4 "\tACorF_filter\tmass2intens_repres\n"}
-			if($repres_opt eq "max_intensity_max_mass"){print F4 "\tACorF_filter\tmax_intensity_max_mass_repres\n"}
-			}
-		else{
-			if($repres_opt eq "intensity"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tintensity_repres\tannotation_relative_to_representative\n"}
-			if($repres_opt eq "mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass_repres\tannotation_relative_to_representative\n"}
-			if($repres_opt eq "mixt"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass2intens_repres\tannotation_relative_to_representative\n"}
-			if($repres_opt eq "max_intensity_max_mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmax_intensity_max_mass_repres\tannotation_relative_to_representative\n"}
-		}
-		
-		
-		### Creation of a header hash
-		for(my $i=0; $i<scalar(@tline);$i++){
-			my $a = $tline[$i];
-			$hheader{$a}=$i;
-		}
-	}
-	
-	else{
-		my $find = 0;
-		foreach my $v (keys %hcorrelgroup){
-			if(defined($hgrouprepres{$v}{$tline[0]})){
-				print F4 "$line\t$v";
-					
-				if($opt != 1){
-					if(defined($hcorrelgroup{$v}{$tline[0]})){
-						print F4 "\t$hcorrelgroup{$v}{$tline[0]}\t";
-							
-					}
-					else{
-						print F4 "\t";
-					}
-				}
-				else{
-					print F4 "\t";
-				}
-					
-				if($repres_opt eq "intensity"){
-					if($tline[0] eq $hgrouprepres{$v}{max_int}){
-						print F4 "1\t";
-					}
-					else{
-						print F4 "0\t";
-					}
-					$find = 1;
-				}
-				if($repres_opt eq "mass"){
-					if($tline[0] eq $hgrouprepres{$v}{max_mass}){
-						print F4 "1\t";
-					}
-					else{
-						print F4 "0\t";
-					}
-					$find = 1;
-				}
-				if($repres_opt eq "mixt"){
-					if($tline[0] eq $hgrouprepres{$v}{max_squared}){
-						print F4 "1\t";
-					}
-					else{
-						print F4 "0\t";
-					}
-					$find = 1;
-				}
-				if($repres_opt eq "max_intensity_max_mass"){
-					if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){
-						print F4 "1\t";
-					}
-					else{
-						print F4 "0\t";
-					}
-					$find = 1;
-				}
-					
-				if($repres_opt eq "intensity"){print F4 "$hgrouprepres{$v}{max_int}\t"}
-				if($repres_opt eq "mass"){print F4 "$hgrouprepres{$v}{max_mass}\t"}
-				if($repres_opt eq "mixt"){print F4 "$hgrouprepres{$v}{max_squared}\t"}
-				if($repres_opt eq "max_intensity_max_mass"){print F4 "$hgrouprepres{$v}{max_int_max_mass}\t"}
-				
-				if($opt != 1){
-					if(defined($hreprescomparison{$v}{$tline[0]}{repres_diff})){
-						print F4 "$hreprescomparison{$v}{$tline[0]}{repres_diff}\n";
-					}
-					else{
-						print F4 "-\n";
-					}
-				}
-				else{
-					print F4 "\n";
-				}
-			}
-		}
-		if($find == 0){
-			$groupct ++;
-			my $group = "group".$groupct;
-			if($opt != 1){
-				print F4 "$line\t$group\t-\t-\t-\t-\n";
-			}
-			else{
-				print F4 "$line\t$group\t-\t-\n";
-			}	
-		}
-	}
-	$line_nb ++;
-}
-
-print "Print in result file done\n";
-
-print "All steps done\n";
-