Mercurial > repos > melpetera > acorf
diff ACorF/Analytic_correlation_filtration.pl @ 2:15430e89c97d draft default tip
Uploaded
author | melpetera |
---|---|
date | Thu, 07 Nov 2019 03:42:14 -0500 |
parents | 26aa3a8f95ce |
children |
line wrap: on
line diff
--- a/ACorF/Analytic_correlation_filtration.pl Wed Oct 30 06:54:20 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,651 +0,0 @@ -#!usr/bin/perl - -### Perl modules -use warnings; -use strict; -use Getopt::Long qw(GetOptions); #Creation of script options -use Pod::Usage qw(pod2usage); #Creation of script options - -#Personnal packages -use FindBin ; ## Allows you to locate the directory of original perl script -#use lib $FindBin::Bin; -use lib "$FindBin::Bin/lib"; -use IonFiltration; - -my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete - -######################## -### Options and help ### -######################## - -GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2); - -### Check required parameters : -pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file; -#pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file; -pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation. -#It should be a number (1 ; 2 or 3) : -# 1 : Don't take into acount mass information (only RT) ; -# 2 : Check that all mass differences are include in a specific list and taking into acount RT information -# 3 : Check that all mass differences are include in a specific list, ignoring RT information -#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.}, -exitval=>1, -verbose=>0}) unless $opt; -pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation. -It should be one of the 3 options below : - "mass" : choose the ion with the highest mass as the representant - "intensity" : choose the ion with the highest intensity as the representant - "mixt" : choose the ion with the highest (mass^2 * intensity) as the representant - "max_intensity_max_mass" : choose tha ion witht he highest intenisty among the 5 most intense ions of the group}, -exitval=>1, -verbose=>0}) unless $repres_opt; -pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix; -pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM; -#pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold; -#pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold; -pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold; -pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular; -pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif; - - -#if(($opt != 1) && ($opt != 2) && ($opt != 3)){ -# print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n"; -# exit; -#} - - - -if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){ - print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n"; - exit; -} - - - -######################################################################### -#### Création of a hash containing all adduits and fragments possible ### -######################################################################### - -my %hmass; -if($opt != 1){ - %hmass = IonFiltration::MassCollecting($mass_file); - -} - -my $refhmass = \%hmass; - -print "Création of a hash containing all adduits and fragments possible\n"; - - -######################################################## -### Creation of a sif table + correlation filtration ### -######################################################## - -my %hrtmz; -($output_sif, %hrtmz) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass); -print "Creation of a sif table + correlation filtration done\n"; - - -###################################################### -### Analytic correlation filtrering follow options ### -###################################################### - -my %hheader_file; -my %hduplicate; - -my %hcorrelgroup; -my $groupct=1; - -my $linenb3=0; -my %hheader_line; - - - -open (F1, $output_sif) or die "Impossible to open $output_sif\n"; - -while(my $line = <F1>){ - my $count=0; - chomp $line; - my @tline = split(/\t/, $line); - my $a = $tline[0]; - my $b = $tline[2]; - - my $amass=$hrtmz{$a}{mz}; - my $atemp=$hrtmz{$a}{rt}; - my $bmass= $hrtmz{$b}{mz}; - my $btemp=$hrtmz{$b}{rt}; - my $diff = $amass-$bmass; - $diff = abs($diff); - - ### Option 1: Don't take into acount mass information ### - - if($opt == 1){ - my $btplus = $btemp + $rt_threshold; - my $btmoins = $btemp - $rt_threshold; - if(($btmoins <= $atemp) && ($atemp <= $btplus)){ - foreach my $k (keys %hcorrelgroup){ - if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ - $hcorrelgroup{$k}{$a}=1; - $hcorrelgroup{$k}{$b}=1; - $count++; - last; - } - } - if($count == 0){ - my $groupnb="group".$groupct; - $hcorrelgroup{$groupnb}{$a}=1; - $hcorrelgroup{$groupnb}{$b}=1; - $groupct ++; - } - } - } - - - - ### Option 2: Check that all mass differences are include in a specific list taking into account RT information ### - - elsif($opt == 2){ - - my $print = 0; - foreach my $s (keys %{$refhmass}){ - foreach my $r (keys %{$refhmass->{$s}}){ - my $rm = $r - $mass_threshold; - my $rp = $r + $mass_threshold; - if(($diff <= $rp) && ($diff >= $rm)){ - if($print == 0){ - my $btplus = $btemp + $rt_threshold; - my $btmoins = $btemp - $rt_threshold; - - if(($btmoins <= $atemp) && ($atemp <= $btplus)){ - foreach my $k (keys %hcorrelgroup){ - if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ - $hcorrelgroup{$k}{$a}=1; - $hcorrelgroup{$k}{$b}=1; - $count++; - last; - } - } - if($count == 0){ - my $groupnb="group".$groupct; - $hcorrelgroup{$groupnb}{$a}=1; - $hcorrelgroup{$groupnb}{$b}=1; - $groupct ++; - } - $print = 1; - } - } - } - } - } - } - - - ### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ### - - elsif($opt == 3){ - - my $print = 0; - foreach my $s (keys %{$refhmass}){ - foreach my $r (keys %{$refhmass->{$s}}){ - my $rm = $r - $mass_threshold; - my $rp = $r + $mass_threshold; - if(($diff <= $rp) && ($diff >= $rm)){ - if($print == 0){ - - foreach my $k (keys %hcorrelgroup){ - if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ - $hcorrelgroup{$k}{$a}=1; - $hcorrelgroup{$k}{$b}=1; - $count++; - last; - } - } - if($count == 0){ - my $groupnb="group".$groupct; - $hcorrelgroup{$groupnb}{$a}=1; - $hcorrelgroup{$groupnb}{$b}=1; - $groupct ++; - } - $print = 1; - } - } - } - } - } -} -close F1; - -print "Analytic correlation filtrering follow options done\n"; - - -############################################# -### Join groups that have been subdivided ### -############################################# - -my @tdelete; - -foreach my $k (keys %hcorrelgroup){ - foreach my $i (keys %{$hcorrelgroup{$k}}){ - foreach my $v (keys %hcorrelgroup){ - my $count = 0; - if ($v ne $k){ - foreach my $w (keys %{$hcorrelgroup{$v}}){ - if($w eq $i){ - $count = 1; - push(@tdelete, $v); - } - } - } - if($count == 1){ - foreach my $w (keys %{$hcorrelgroup{$v}}){ - $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; - } - delete($hcorrelgroup{$v}); - } - } - } -} - -foreach my $t (@tdelete){ - delete($hcorrelgroup{$t}); -} - - -### Do it twice to see if it fix the problem of unmerge groups - -foreach my $k (keys %hcorrelgroup){ - foreach my $i (keys %{$hcorrelgroup{$k}}){ - foreach my $v (keys %hcorrelgroup){ - my $count = 0; - if ($v ne $k){ - foreach my $w (keys %{$hcorrelgroup{$v}}){ - if($w eq $i){ - $count = 1; - push(@tdelete, $v); - } - } - } - if($count == 1){ - foreach my $w (keys %{$hcorrelgroup{$v}}){ - $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; - } - delete($hcorrelgroup{$v}); - } - } - } -} - -foreach my $t (@tdelete){ - delete($hcorrelgroup{$t}); -} - -print "Join groups that have been subdivided done\n"; - -####################################################### -### Addition of annotation information among groups ### -####################################################### - -foreach my $k (keys %hcorrelgroup){ - foreach my $i (keys %{$hcorrelgroup{$k}}){ - foreach my $j (keys %{$hcorrelgroup{$k}}){ - my $count = 0; - if ($i ne $j){ - - my $a = $hrtmz{$i}{mz}; - my $b = $hrtmz{$j}{mz}; - - my $diff = $a - $b; - my $sign; - if($diff>0){ - $sign="+"; - } - if($diff<0){ - $sign="-"; - } - $diff = abs($diff); - - foreach my $z (keys %{$refhmass}){ - - foreach my $y (keys %{$refhmass->{$z}}){ - my $ym = $y - $mass_threshold; - my $yp = $y + $mass_threshold; - - - if(($diff <= $yp) && ($diff >= $ym)){ - my $diff_list = $diff - $y; - $diff_list = abs($diff_list); - $diff_list = sprintf ("%0.6f", $diff_list); - - if($hcorrelgroup{$k}{$i} eq 1){ - my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|"; - $hcorrelgroup{$k}{$i}=$val; - $count ++; - } - else{ - if($count == 0){ - my $val = "@".$j."|".$sign."(".$z.")(".$diff_list.")|"; - $hcorrelgroup{$k}{$i}.=$val; - $count ++; - } - else{ - my $val = $sign."(".$z.")(".$diff_list.")|"; - $hcorrelgroup{$k}{$i}.=$val; - $count ++; - } - } - } - } - } - } - } - } -} - - -print "Addition of annotation information among groups done\n"; - - -#################################################### -### Choose the representative ion for each group ### -#################################################### - -my %hgrouprepres; - -open(F3, $dataMatrix); - -while (my $line = <F3>){ - chomp $line; - - my @tline = split (/\t/, $line); - - foreach my $k (keys %hcorrelgroup){ - foreach my $i (keys %{$hcorrelgroup{$k}}){ - if($tline[0] eq $i){ - $hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz}; - my $intensity; - my $nbsubjects=0; - for(my $y=1;$y<scalar(@tline);$y++){ - $intensity += $tline[$y]; - $nbsubjects ++; - } - my $meanintensity = $intensity/$nbsubjects; - $hgrouprepres{$k}{$i}{intensity}=$meanintensity; - $hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity}); - } - } - } -} -close F3; - -foreach my $z (keys %hgrouprepres){ - my $max_intensity = 0; - my $max_int_ion = ""; - my $max_mass = 0; - my $max_mass_ion = ""; - my $max_squared = 0; - my $max_squared_ion = ""; - foreach my $w (keys %{$hgrouprepres{$z}}){ - if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){ - $max_intensity = $hgrouprepres{$z}{$w}{intensity}; - $max_int_ion = $w; - } - if($hgrouprepres{$z}{$w}{mass} > $max_mass){ - $max_mass = $hgrouprepres{$z}{$w}{mass}; - $max_mass_ion = $w; - } - if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){ - $max_squared = $hgrouprepres{$z}{$w}{squaredmassint}; - $max_squared_ion = $w; - } - } - - my $max_int_max_mass_ion=""; - - if($repres_opt eq "max_intensity_max_mass"){ - my %hfirst; - my $first=0; - foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){ - $first ++; - if ($first <= 3){ - $hfirst{$w} = $hgrouprepres{$z}{$w}{intensity}; - } - } - - my $first_2 = 0; - my $intens_max = 0; - my $mass_max = 0; - - foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){ - - $first_2 ++; - if($first_2 == 1){ - $intens_max = $hfirst{$y}; - if($intensity_threshold > $intens_max){ - $intensity_threshold = 0; - } - $max_int_max_mass_ion = $y; - $mass_max = $hgrouprepres{$z}{$y}{mass}; - } - if($hgrouprepres{$z}{$y}{mass} > $mass_max){ - if($hfirst{$y}>$intensity_threshold){ - my $a = $intens_max * $intensity_pourc; - if($hfirst{$y} > $a){ - $max_int_max_mass_ion = $y; - $mass_max = $hgrouprepres{$z}{$y}{mass}; - } - } - } - } - } - - $hgrouprepres{$z}{max_int}=$max_int_ion; - $hgrouprepres{$z}{max_mass}=$max_mass_ion; - $hgrouprepres{$z}{max_squared}=$max_squared_ion; - $hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion; - -} - - -print "Choose the representative ion for each group done\n"; - -############################################################################# -### Addition of annotation information relative to the representative ion ### -############################################################################# - -my %hreprescomparison; - -my $representative=""; - -if($opt != 1){ - foreach my $k (keys %hcorrelgroup){ - foreach my $i (keys %{$hcorrelgroup{$k}}){ - - if($repres_opt eq "mass"){$representative = $hgrouprepres{$k}{max_mass}} - if($repres_opt eq "intensity"){$representative = $hgrouprepres{$k}{max_int}} - if($repres_opt eq "mixt"){$representative = $hgrouprepres{$k}{max_squared}} - if($repres_opt eq "max_intensity_max_mass"){$representative = $hgrouprepres{$k}{max_int_max_mass}} - - - my $count = 0; - if ($i ne $representative){ - - my $a = $hrtmz{$i}{mz}; - my $b = $hrtmz{$representative}{mz}; - - my $diff = $a - $b; - my $sign; - if($diff>0){ - $sign="+"; - } - if($diff<0){ - $sign="-"; - } - $diff = abs($diff); - - foreach my $z (keys %{$refhmass}){ - - foreach my $y (keys %{$refhmass->{$z}}){ - my $ym = $y - $mass_threshold; - my $yp = $y + $mass_threshold; - - if(($diff <= $yp) && ($diff >= $ym)){ - my $diff_list = $diff - $y; - $diff_list = abs($diff_list); - $diff_list = sprintf ("%0.4f", $diff_list); - if($hcorrelgroup{$k}{$i} eq 1){ - my $valrep = "[M ".$sign."(".$z.")]|"; - $hreprescomparison{$k}{$i}{repres_diff}=$valrep; - $count ++; - } - else{ - if($count == 0){ - my $valrep = "[M ".$sign."(".$z.")]|"; - $hreprescomparison{$k}{$i}{repres_diff}.=$valrep; - $count ++; - } - else{ - my $valrep = "[M ".$sign."(".$z.")]|"; - $hreprescomparison{$k}{$i}{repres_diff}.=$valrep; - $count ++; - } - } - } - } - } - } - else{ - $hreprescomparison{$k}{$i}{repres_diff}="M"; - } - } - } -} - - -print "Addition of annotation information relative to the representative ion done\n"; - -############################## -### Print in result file ! ### -############################## - -open(F4, ">$output_tabular"); -open(F5, $combined_DMVM); - -my $line_nb = 0; -my %hheader; -while (my $line = <F5>){ - chomp $line; - - - my @tline = split (/\t/, $line); - - if($line_nb == 0){ - print F4 "$line\tACorF_groups"; - if($opt == 1){ - if($repres_opt eq "intensity"){print F4 "\tACorF_filter\tintensity_repres\n"} - if($repres_opt eq "mass"){print F4 "\tACorF_filter\tmass_repres\n"} - if($repres_opt eq "mixt"){print F4 "\tACorF_filter\tmass2intens_repres\n"} - if($repres_opt eq "max_intensity_max_mass"){print F4 "\tACorF_filter\tmax_intensity_max_mass_repres\n"} - } - else{ - if($repres_opt eq "intensity"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tintensity_repres\tannotation_relative_to_representative\n"} - if($repres_opt eq "mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass_repres\tannotation_relative_to_representative\n"} - if($repres_opt eq "mixt"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmass2intens_repres\tannotation_relative_to_representative\n"} - if($repres_opt eq "max_intensity_max_mass"){print F4 "\tisotopes_adducts_fragments_[\@id|annotation(delta_annotation)]\tACorF_filter\tmax_intensity_max_mass_repres\tannotation_relative_to_representative\n"} - } - - - ### Creation of a header hash - for(my $i=0; $i<scalar(@tline);$i++){ - my $a = $tline[$i]; - $hheader{$a}=$i; - } - } - - else{ - my $find = 0; - foreach my $v (keys %hcorrelgroup){ - if(defined($hgrouprepres{$v}{$tline[0]})){ - print F4 "$line\t$v"; - - if($opt != 1){ - if(defined($hcorrelgroup{$v}{$tline[0]})){ - print F4 "\t$hcorrelgroup{$v}{$tline[0]}\t"; - - } - else{ - print F4 "\t"; - } - } - else{ - print F4 "\t"; - } - - if($repres_opt eq "intensity"){ - if($tline[0] eq $hgrouprepres{$v}{max_int}){ - print F4 "1\t"; - } - else{ - print F4 "0\t"; - } - $find = 1; - } - if($repres_opt eq "mass"){ - if($tline[0] eq $hgrouprepres{$v}{max_mass}){ - print F4 "1\t"; - } - else{ - print F4 "0\t"; - } - $find = 1; - } - if($repres_opt eq "mixt"){ - if($tline[0] eq $hgrouprepres{$v}{max_squared}){ - print F4 "1\t"; - } - else{ - print F4 "0\t"; - } - $find = 1; - } - if($repres_opt eq "max_intensity_max_mass"){ - if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){ - print F4 "1\t"; - } - else{ - print F4 "0\t"; - } - $find = 1; - } - - if($repres_opt eq "intensity"){print F4 "$hgrouprepres{$v}{max_int}\t"} - if($repres_opt eq "mass"){print F4 "$hgrouprepres{$v}{max_mass}\t"} - if($repres_opt eq "mixt"){print F4 "$hgrouprepres{$v}{max_squared}\t"} - if($repres_opt eq "max_intensity_max_mass"){print F4 "$hgrouprepres{$v}{max_int_max_mass}\t"} - - if($opt != 1){ - if(defined($hreprescomparison{$v}{$tline[0]}{repres_diff})){ - print F4 "$hreprescomparison{$v}{$tline[0]}{repres_diff}\n"; - } - else{ - print F4 "-\n"; - } - } - else{ - print F4 "\n"; - } - } - } - if($find == 0){ - $groupct ++; - my $group = "group".$groupct; - if($opt != 1){ - print F4 "$line\t$group\t-\t-\t-\t-\n"; - } - else{ - print F4 "$line\t$group\t-\t-\n"; - } - } - } - $line_nb ++; -} - -print "Print in result file done\n"; - -print "All steps done\n"; -